From 0efdbf168fc324b0173cbc1a2019c4748d5f312a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 25 Sep 2019 10:08:53 -0700 Subject: [PATCH v25 2/4] Add deduplication to nbtree --- src/include/access/nbtree.h | 329 +++++++- src/include/access/nbtxlog.h | 71 +- src/include/access/rmgrlist.h | 2 +- src/backend/access/common/reloptions.c | 9 + src/backend/access/index/genam.c | 4 + src/backend/access/nbtree/Makefile | 1 + src/backend/access/nbtree/README | 74 +- src/backend/access/nbtree/nbtdedup.c | 715 ++++++++++++++++++ src/backend/access/nbtree/nbtinsert.c | 343 ++++++++- src/backend/access/nbtree/nbtpage.c | 238 +++++- src/backend/access/nbtree/nbtree.c | 167 +++- src/backend/access/nbtree/nbtsearch.c | 250 +++++- src/backend/access/nbtree/nbtsort.c | 204 ++++- src/backend/access/nbtree/nbtsplitloc.c | 36 +- src/backend/access/nbtree/nbtutils.c | 204 ++++- src/backend/access/nbtree/nbtxlog.c | 236 +++++- src/backend/access/rmgrdesc/nbtdesc.c | 25 +- src/backend/utils/misc/guc.c | 28 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/bin/psql/tab-complete.c | 4 +- contrib/amcheck/verify_nbtree.c | 180 ++++- doc/src/sgml/btree.sgml | 123 ++- doc/src/sgml/charset.sgml | 9 +- doc/src/sgml/config.sgml | 33 + doc/src/sgml/maintenance.sgml | 8 + doc/src/sgml/ref/create_index.sgml | 44 +- doc/src/sgml/ref/reindex.sgml | 5 +- src/test/regress/expected/btree_index.out | 16 + src/test/regress/sql/btree_index.sql | 17 + 29 files changed, 3131 insertions(+), 245 deletions(-) create mode 100644 src/backend/access/nbtree/nbtdedup.c diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9833cc10bd..1482d5ab1a 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -24,6 +24,17 @@ #include "storage/bufmgr.h" #include "storage/shm_toc.h" +/* deduplication GUC modes */ +typedef enum DeduplicationMode +{ + DEDUP_OFF = 0, /* disabled */ + DEDUP_ON, /* enabled generally */ + DEDUP_NONUNIQUE /* enabled with non-unique indexes only (default) */ +} DeduplicationMode; + +/* GUC parameter */ +extern int btree_deduplication; + /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -108,6 +119,7 @@ typedef struct BTMetaPageData * pages */ float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples * during last cleanup */ + bool btm_safededup; /* deduplication known to be safe? */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -115,7 +127,8 @@ typedef struct BTMetaPageData /* * The current Btree version is 4. That's what you'll get when you create - * a new index. + * a new index. The btm_safededup field can only be set if this happened + * on Postgres 13, but it's safe to read with version 3 indexes. * * Btree version 3 was used in PostgreSQL v11. It is mostly the same as * version 4, but heap TIDs were not part of the keyspace. Index tuples @@ -132,8 +145,8 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number in metapage */ #define BTREE_VERSION 4 /* current version number */ -#define BTREE_MIN_VERSION 2 /* minimal supported version number */ -#define BTREE_NOVAC_VERSION 3 /* minimal version with all meta fields */ +#define BTREE_MIN_VERSION 2 /* minimum supported version */ +#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ /* * Maximum size of a btree index entry, including its tuple header. 
@@ -155,6 +168,26 @@ typedef struct BTMetaPageData MAXALIGN_DOWN((PageGetPageSize(page) - \ MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) +/* + * MaxBTreeIndexTuplesPerPage is an upper bound on the number of "logical" + * tuples that may be stored on a btree leaf page. This is comparable to + * the generic/physical MaxIndexTuplesPerPage upper bound. A separate + * upper bound is needed in certain contexts due to posting list tuples, + * which only use a single physical page entry to store many logical + * tuples. (MaxBTreeIndexTuplesPerPage is used to size the per-page + * temporary buffers used by index scans.) + * + * Note: we don't bother considering per-physical-tuple overheads here to + * keep things simple (value is based on how many elements a single array + * of heap TIDs must have to fill the space between the page header and + * special area). The value is slightly higher (i.e. more conservative) + * than necessary as a result, which is considered acceptable. There will + * only be three (very large) physical posting list tuples in leaf pages + * that have the largest possible number of heap TIDs/logical tuples. + */ +#define MaxBTreeIndexTuplesPerPage \ + (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ + sizeof(ItemPointerData)) /* * The leaf-page fillfactor defaults to 90% but is user-adjustable. @@ -230,16 +263,15 @@ typedef struct BTMetaPageData * tuples (non-pivot tuples). _bt_check_natts() enforces the rules * described here. * - * Non-pivot tuple format: + * Non-pivot tuple format (plain/non-posting variant): * * t_tid | t_info | key values | INCLUDE columns, if any * * t_tid points to the heap TID, which is a tiebreaker key column as of - * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never - * set for non-pivot tuples. + * BTREE_VERSION 4. * - * All other types of index tuples ("pivot" tuples) only have key columns, - * since pivot tuples only exist to represent how the key space is + * Non-pivot tuples complement pivot tuples, which only have key columns. + * The sole purpose of pivot tuples is to represent how the key space is * separated. In general, any B-Tree index that has more than one level * (i.e. any index that does not just consist of a metapage and a single * leaf root page) must have some number of pivot tuples, since pivot @@ -283,20 +315,103 @@ typedef struct BTMetaPageData * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any * number of columns/attributes <= INDEX_MAX_KEYS. * + * Sometimes non-pivot tuples also use a representation that repurposes + * t_tid to store metadata rather than a TID. Postgres 13 introduced a new + * non-pivot tuple format to support deduplication: posting list tuples. + * Deduplication folds together multiple equal non-pivot tuples into a + * logically equivalent, space efficient representation. A posting list is + * an array of ItemPointerData elements. Regular non-pivot tuples are + * merged together to form posting list tuples lazily, at the point where + * we'd otherwise have to split a leaf page. + * + * Posting tuple format (alternative non-pivot tuple representation): + * + * t_tid | t_info | key values | posting list (TID array) + * + * Posting list tuples are recognized as such by having the + * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status + * bit set in t_tid. 
These flags redefine the content of the posting
+ * tuple's t_tid to store an offset to the posting list, as well as the
+ * total number of posting list array elements.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items present in the tuple, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.  Like any non-pivot tuple, the number of columns stored is
+ * always implicitly the total number in the index (in practice there can
+ * never be non-key columns stored, since deduplication is not supported
+ * with INCLUDE indexes).
+ *
  * Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes).  They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
  */
 #define INDEX_ALT_TID_MASK	INDEX_AM_RESERVED_BIT

 /* Item pointer offset bits */
 #define BT_RESERVED_OFFSET_MASK	0xF000
 #define BT_N_KEYS_OFFSET_MASK	0x0FFF
+#define BT_N_POSTING_OFFSET_MASK 0x0FFF
 #define BT_HEAP_TID_ATTR	0x1000
+#define BT_IS_POSTING		0x2000
+
+/*
+ * N.B.: BTreeTupleIsPivot() should only be used in code that deals with
+ * heapkeyspace indexes specifically.  BTreeTupleIsPosting() works with all
+ * nbtree indexes, though.
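+ * Note that in a !heapkeyspace index, a pivot tuple may lack
+ * INDEX_ALT_TID_MASK entirely, so BTreeTupleIsPivot() can have false
+ * negatives there (though never false positives).  BT_IS_POSTING is never
+ * set in !heapkeyspace indexes, which is why BTreeTupleIsPosting() remains
+ * reliable.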
+ */ +#define BTreeTupleIsPivot(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\ + ) +#define BTreeTupleIsPosting(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\ + ) + +#define BTreeTupleClearBtIsPosting(itup) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \ + } while(0) + +#define BTreeTupleGetNPosting(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \ + ) +#define BTreeTupleSetNPosting(itup, n) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \ + Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \ + } while(0) + +/* + * If tuple is posting, t_tid.ip_blkid contains offset of the posting list + */ +#define BTreeTupleGetPostingOffset(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \ + ) +#define BTreeSetPostingMeta(itup, nposting, off) \ + do { \ + BTreeTupleSetNPosting(itup, nposting); \ + Assert(BTreeTupleIsPosting(itup)); \ + ItemPointerSetBlockNumber(&((itup)->t_tid), (off)); \ + } while(0) + +#define BTreeTupleGetPosting(itup) \ + (ItemPointer) ((char*) (itup) + BTreeTupleGetPostingOffset(itup)) +#define BTreeTupleGetPostingN(itup,n) \ + (BTreeTupleGetPosting(itup) + (n)) /* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ @@ -327,40 +442,69 @@ typedef struct BTMetaPageData */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + (BTreeTupleIsPivot(itup)) ? \ ( \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) + +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int n) +{ + Assert(!BTreeTupleIsPosting(itup)); + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, n & BT_N_KEYS_OFFSET_MASK); +} /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Get tiebreaker heap TID attribute, if any. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? 
*/ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &(itup->t_tid); +} + /* - * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. Works with + * non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxHeapTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup) + (BTreeTupleGetNPosting(itup) - 1); + + return &(itup->t_tid); +} + +/* + * Set the heap TID attribute for a pivot tuple */ #define BTreeTupleSetAltHeapTID(itup) \ do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(BTreeTupleIsPivot(itup)); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ } while(0) @@ -435,6 +579,11 @@ typedef BTStackData *BTStack; * indexes whose version is >= version 4. It's convenient to keep this close * by, rather than accessing the metapage repeatedly. * + * safededup is set to indicate that index may use dynamic deduplication + * safely (index storage parameter separately indicates if deduplication is + * currently in use). This is also a property of the index relation rather + * than an indexscan that is kept around for convenience. + * * anynullkeys indicates if any of the keys had NULL value when scankey was * built from index tuple (note that already-truncated tuple key attributes * set NULL as a placeholder key value, which also affects value of @@ -470,6 +619,7 @@ typedef BTStackData *BTStack; typedef struct BTScanInsertData { bool heapkeyspace; + bool safededup; bool anynullkeys; bool nextkey; bool pivotsearch; @@ -508,10 +658,70 @@ typedef struct BTInsertStateData bool bounds_valid; OffsetNumber low; OffsetNumber stricthigh; + + /* + * if _bt_binsrch_insert found the location inside existing posting list, + * save the position inside the list. This will be -1 in rare cases where + * the overlapping posting list is LP_DEAD. + */ + int postingoff; } BTInsertStateData; typedef BTInsertStateData *BTInsertState; +/* + * State used to representing a pending posting list during deduplication. + * + * Each entry represents a group of consecutive items from the page, starting + * from page offset number 'baseoff', which is the offset number of the "base" + * tuple on the page undergoing deduplication. 'nitems' is the total number + * of items from the page that will be merged to make a new posting tuple. + * + * Note: 'nitems' means the number of physical index tuples/line pointers on + * the page, starting with and including the item at offset number 'baseoff' + * (so nitems should be at least 2 when interval is used). These existing + * tuples may be posting list tuples or regular tuples. 
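+ *
+ * For example (with purely hypothetical offsets): baseoff = 10 and
+ * nitems = 3 describe merging the items at offsets 10, 11 and 12 into a
+ * single posting list tuple that replaces them at offset 10.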
+ */ +typedef struct BTDedupInterval +{ + OffsetNumber baseoff; + uint16 nitems; +} BTDedupInterval; + +/* + * Btree-private state used to deduplicate items on a leaf page + */ +typedef struct BTDedupStateData +{ + Relation rel; + /* Deduplication status info for entire page/operation */ + Size maxitemsize; /* Limit on size of final tuple */ + IndexTuple newitem; + bool checkingunique; /* Use unique index strategy? */ + OffsetNumber skippedbase; /* First offset skipped by checkingunique */ + + /* Metadata about current pending posting list */ + ItemPointer htids; /* Heap TIDs in pending posting list */ + int nhtids; /* # heap TIDs in nhtids array */ + int nitems; /* See BTDedupInterval definition */ + Size alltupsize; /* Includes line pointer overhead */ + bool overlap; /* Avoid overlapping posting lists? */ + + /* Metadata about base tuple of current pending posting list */ + IndexTuple base; /* Use to form new posting list */ + OffsetNumber baseoff; /* page offset of base */ + Size basetupsize; /* base size without posting list */ + + /* + * Pending posting list. Contains information about a group of + * consecutive items that will be deduplicated by creating a new posting + * list tuple. + */ + BTDedupInterval interval; +} BTDedupStateData; + +typedef BTDedupStateData *BTDedupState; + /* * BTScanOpaqueData is the btree-private state needed for an indexscan. * This consists of preprocessed scan keys (see _bt_preprocess_keys() for @@ -535,7 +745,10 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a "base" tuple once, + * allowing the same key to be returned for each logical tuple associated + * with the physical posting list tuple (i.e. for each TID from the posting + * list). */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -568,6 +781,12 @@ typedef struct BTScanPosData */ int nextTupleOffset; + /* + * Posting list tuples use postingTupleOffset to store the current + * location of the tuple that is returned multiple times. + */ + int postingTupleOffset; + /* * The items array is always ordered in index order (ie, increasing * indexoffset). When scanning backwards it is convenient to fill the @@ -579,7 +798,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + BTScanPosItem items[MaxBTreeIndexTuplesPerPage]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -687,6 +906,7 @@ typedef struct BTOptions int fillfactor; /* page fill factor in percent (0..100) */ /* fraction of newly inserted tuples prior to trigger index cleanup */ float8 vacuum_cleanup_index_scale_factor; + bool deduplication; /* Use deduplication where safe? */ } BTOptions; #define BTGetFillFactor(relation) \ @@ -695,8 +915,18 @@ typedef struct BTOptions (relation)->rd_options ? \ ((BTOptions *) (relation)->rd_options)->fillfactor : \ BTREE_DEFAULT_FILLFACTOR) +#define BTGetUseDedup(relation) \ + (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ + relation->rd_rel->relam == BTREE_AM_OID), \ + ((relation)->rd_options ? 
\ + ((BTOptions *) (relation)->rd_options)->deduplication : \ + BTGetUseDedupGUC(relation))) #define BTGetTargetPageFreeSpace(relation) \ (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100) +#define BTGetUseDedupGUC(relation) \ + (relation->rd_index->indisunique ? \ + btree_deduplication == DEDUP_ON : \ + btree_deduplication != DEDUP_OFF) /* * Constant definition for progress reporting. Phase numbers must match @@ -743,6 +973,22 @@ extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); +/* + * prototypes for functions in nbtdedup.c + */ +extern void _bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, + IndexTuple newitem, Size newitemsz, + bool checkingunique); +extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, + OffsetNumber base_off); +extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); +extern Size _bt_dedup_finish_pending(Buffer buffer, BTDedupState state, + bool need_wal); +extern IndexTuple _bt_form_posting(IndexTuple tuple, ItemPointer htids, + int nhtids); +extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, + int postingoff); + /* * prototypes for functions in nbtinsert.c */ @@ -761,7 +1007,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page page, /* * prototypes for functions in nbtpage.c */ -extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool safededup); extern void _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples); extern void _bt_upgrademetapage(Page page); @@ -769,6 +1016,7 @@ extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); extern bool _bt_heapkeyspace(Relation rel); +extern bool _bt_safededup(Relation rel); extern void _bt_checkpage(Relation rel, Buffer buf); extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, @@ -779,7 +1027,9 @@ extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable); + OffsetNumber *deletable, int ndeletable, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdateable); extern int _bt_pagedel(Relation rel, Buffer buf); /* @@ -829,6 +1079,7 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); +extern bool _bt_opclasses_support_dedup(Relation index); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 71435a13b3..d387905cc0 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -28,7 +28,8 @@ #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ -/* 0x50 and 0x60 are unused */ +#define XLOG_BTREE_DEDUP_PAGE 0x50 /* deduplicate tuples on leaf page */ +#define XLOG_BTREE_INSERT_POST 0x60 /* add index tuple with 
posting split */
 #define XLOG_BTREE_DELETE	0x70	/* delete leaf index tuples for a page */
 #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
 #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
@@ -53,21 +54,32 @@ typedef struct xl_btree_metadata
 	uint32		fastlevel;
 	TransactionId oldest_btpo_xact;
 	float8		last_cleanup_num_heap_tuples;
+	bool		btm_safededup;
 } xl_btree_metadata;

 /*
  * This is what we need to know about simple (without split) insert.
  *
- * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
- * Note that INSERT_META implies it's not a leaf page.
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
+ * INSERT_POST.  Note that INSERT_META and INSERT_UPPER imply it's not a
+ * leaf page, while INSERT_POST and INSERT_LEAF imply that it is.
  *
- * Backup Blk 0: original page (data contains the inserted tuple)
+ * Backup Blk 0: original page
  * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
  * Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
+ * Note: The new tuple is actually the "original" new item in the posting
+ * list split insert case (i.e. the INSERT_POST case).  A split offset for
+ * the posting list is logged before the original new item.  Recovery needs
+ * both, since it must do an in-place update of the existing posting list
+ * that was split as an extra step.  Also, recovery generates a "final"
+ * newitem.  See _bt_swap_posting().
  */
 typedef struct xl_btree_insert
 {
 	OffsetNumber offnum;
+
+	/* posting split offset (INSERT_POST only) */
+	/* new tuple that was inserted (or orignewitem in INSERT_POST case) */
 } xl_btree_insert;

 #define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
@@ -91,9 +103,18 @@ typedef struct xl_btree_insert
  *
  * Backup Blk 0: original page / new left page
  *
- * The left page's data portion contains the new item, if it's the _L variant.
- * An IndexTuple representing the high key of the left page must follow with
- * either variant.
+ * The left page's data portion contains the new item, if it's the _L variant
+ * (though _R variant page split records with a posting list split sometimes
+ * need to include newitem).  An IndexTuple representing the high key of the
+ * left page must follow in all cases.
+ *
+ * The newitem is actually an "original" newitem when a posting list split
+ * occurs that requires that the original posting list be updated in passing.
+ * Recovery recognizes this case when postingoff is set.  This corresponds to
+ * the xl_btree_insert INSERT_POST case.  Note that postingoff will be set to
+ * zero (no posting split) when a posting list split occurs where both the
+ * original posting list and newitem go on the right page, since recovery
+ * doesn't need to consider the posting list split at all.
  *
  * Backup Blk 1: new right page
  *
@@ -111,10 +132,26 @@ typedef struct xl_btree_split
 {
 	uint32		level;			/* tree level of page being split */
 	OffsetNumber firstright;	/* first item moved to right page */
-	OffsetNumber newitemoff;	/* new item's offset (useful for _L variant) */
+	OffsetNumber newitemoff;	/* new item's offset */
+	uint16		postingoff;		/* offset inside orig posting tuple */
 } xl_btree_split;

-#define SizeOfBtreeSplit	(offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
+#define SizeOfBtreeSplit	(offsetof(xl_btree_split, postingoff) + sizeof(uint16))
+
+/*
+ * When a page is deduplicated, consecutive groups of tuples with equal keys
+ * are merged together into posting list tuples.
+ *
+ * The WAL record represents the interval that describes the posting tuple
+ * that should be added to the page.
+ */
+typedef struct xl_btree_dedup
+{
+	OffsetNumber baseoff;
+	uint16		nitems;
+} xl_btree_dedup;
+
+#define SizeOfBtreeDedup 	(offsetof(xl_btree_dedup, nitems) + sizeof(uint16))

 /*
  * This is what we need to know about delete of individual leaf index tuples.
@@ -148,19 +185,25 @@ typedef struct xl_btree_reuse_page
 /*
  * This is what we need to know about vacuum of individual leaf index tuples.
  * The WAL record can represent deletion of any number of index tuples on a
- * single index page when executed by VACUUM.
+ * single index page when executed by VACUUM.  It can also support "updates"
+ * of index tuples, which are actually deletions of "logical" tuples contained
+ * in an existing posting list tuple that will still have some remaining
+ * logical tuples once VACUUM finishes.
  *
  * Note that the WAL record in any vacuum of an index must have at least one
- * item to delete.
+ * item to delete or update.
  */
 typedef struct xl_btree_vacuum
 {
-	uint32		ndeleted;
+	uint16		ndeleted;
+	uint16		nupdated;

 	/* DELETED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+	/* UPDATED TUPLES TO ADD BACK FOLLOW */
 } xl_btree_vacuum;

-#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
+#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))

 /*
  * This is what we need to know about marking an empty branch for deletion.
@@ -241,6 +284,8 @@ typedef struct xl_btree_newroot
 extern void btree_redo(XLogReaderState *record);
 extern void btree_desc(StringInfo buf, XLogReaderState *record);
 extern const char *btree_identify(uint8 info);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
 extern void btree_mask(char *pagedata, BlockNumber blkno);

 #endif							/* NBTXLOG_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index 3c0db2ccf5..2b8c6c7fc8 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL,
 PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
 PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
 PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
 PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
 PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
 PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 48377ace24..2b37afd9e5 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -158,6 +158,15 @@ static relopt_bool boolRelOpts[] =
 		},
 		true
 	},
+	{
+		{
+			"deduplication",
+			"Enables deduplication on btree index leaf pages",
+			RELOPT_KIND_BTREE,
+			ShareUpdateExclusiveLock
+		},
+		true
+	},
 	/* list terminator */
 	{{NULL}}
 };
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 2599b5d342..6e1dc596e1
100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -276,6 +276,10 @@ BuildIndexValueDescription(Relation indexRelation, /* * Get the latestRemovedXid from the table entries pointed at by the index * tuples being deleted. + * + * Note: index access methods that don't consistently use the standard + * IndexTuple + heap TID item pointer representation will need to provide + * their own version of this function. */ TransactionId index_compute_xid_horizon_for_tuples(Relation irel, diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index bf245f5dab..d69808e78c 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ nbtcompare.o \ + nbtdedup.o \ nbtinsert.o \ nbtpage.o \ nbtree.o \ diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 6db203e75c..54cb9db49d 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires exclusive lock. In the current code we try to remove LP_DEAD tuples when we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). +hence have exclusive lock on it already). Deduplication can also prevent +a page split, but removing LP_DEAD tuples is the preferred approach. +(Note that posting list tuples can only have their LP_DEAD bit set when +every "logical" tuple represented within the posting list is known dead.) This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -710,6 +713,75 @@ the fallback strategy assumes that duplicates are mostly inserted in ascending heap TID order. The page is split in a way that leaves the left half of the page mostly full, and the right half of the page mostly empty. +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid or at least delay page splits. Deduplication alters +the physical representation of tuples without changing the logical contents +of the index, and without adding overhead to read queries. Non-pivot +tuples are folded together into a single physical tuple with a posting list +(a simple array of heap TIDs with the standard item pointer format). +Deduplication is always applied lazily, at the point where it would +otherwise be necessary to perform a page split. It occurs only when +LP_DEAD items have been removed, as our last line of defense against +splitting a leaf page. We can set the LP_DEAD bit with posting list +tuples, though only when all table tuples are known dead. (Bitmap scans +cannot perform LP_DEAD bit setting, and are the common case with indexes +that contain lots of duplicates, so this downside is considered +acceptable.) + +Large groups of logical duplicates tend to appear together on the same leaf +page due to the special duplicate logic used when choosing a split point. +This facilitates lazy/dynamic deduplication. Deduplication can reliably +deduplicate a large localized group of duplicates before it can span +multiple leaf pages. Posting list tuples are subject to the same 1/3 of a +page restriction as any other tuple. 
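+
+To get a rough feel for the space savings, consider a sketch (this is not
+code from the patch; sizes assume a single int4 key column, 8-byte
+MAXALIGN, and 6-byte heap TIDs):
+
+    int		n = 100;		/* number of equal index entries */
+    Size	plain;
+    Size	posting;
+
+    /* n keys stored as separate tuples: each costs a MAXALIGN'd tuple
+     * (8 byte header + 4 byte key) plus a line pointer */
+    plain = n * (MAXALIGN(8 + 4) + sizeof(ItemIdData));
+    /* the same keys as one posting tuple: the key is stored once,
+     * followed by a sorted array of n heap TIDs */
+    posting = MAXALIGN(SHORTALIGN(8 + 4) + n * sizeof(ItemPointerData)) +
+        sizeof(ItemIdData);
+
+With n = 100, plain works out to 2000 bytes, while posting is roughly 620
+bytes.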
+ +Lazy deduplication allows the page space accounting used during page splits +to have absolutely minimal special case logic for posting lists. A posting +list can be thought of as extra payload that suffix truncation will +reliably truncate away as needed during page splits, just like non-key +columns from an INCLUDE index tuple. An incoming tuple (which might cause +a page split) can always be thought of as a non-posting-list tuple that +must be inserted alongside existing items, without needing to consider +deduplication. Most of the time, that's what actually happens: incoming +tuples are either not duplicates, or are duplicates with a heap TID that +doesn't overlap with any existing posting list tuple. When the incoming +tuple really does overlap with an existing posting list, a posting list +split is performed. Posting list splits work in a way that more or less +preserves the illusion that all incoming tuples do not need to be merged +with any existing posting list tuple. + +Posting list splits work by "overriding" the details of the incoming tuple. +The heap TID of the incoming tuple is altered to make it match the +rightmost heap TID from the existing/originally overlapping posting list. +The offset number that the new/incoming tuple is to be inserted at is +incremented so that it will be inserted to the right of the existing +posting list. The insertion (or page split) operation that completes the +insert does one extra step: an in-place update of the posting list. The +update changes the posting list such that the "true" heap TID from the +original incoming tuple is now contained in the posting list. We make +space in the posting list by removing the heap TID that became the new +item. The size of the posting list won't change, and so the page split +space accounting does not need to care about posting lists. Also, overall +space utilization is improved by keeping existing posting lists large. + +The representation of posting lists is identical to the posting lists used +by GIN, so it would be straightforward to apply GIN's varbyte encoding +compression scheme to individual posting lists. Posting list compression +would break the assumptions made by posting list splits about page space +accounting, though, so it's not clear how compression could be integrated +with nbtree. Besides, posting list compression does not offer a compelling +trade-off for nbtree, since in general nbtree is optimized for consistent +performance with many concurrent readers and writers. A major goal of +nbtree's lazy approach to deduplication is to limit the performance impact +of deduplication with random updates. Even concurrent append-only inserts +of the same key value will tend to have inserts of individual index tuples +in an order that doesn't quite match heap TID order. In general, delaying +deduplication avoids many unnecessary posting list splits, and minimizes +page level fragmentation. + Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c new file mode 100644 index 0000000000..1dbc32b70a --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup.c @@ -0,0 +1,715 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup.c + * Deduplicate items in Lehman and Yao btrees for Postgres. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/nbtxlog.h" +#include "miscadmin.h" +#include "utils/rel.h" + + +/* + * Try to deduplicate items to free at least enough space to avoid a page + * split. This function should be called during insertion, only after LP_DEAD + * items were removed by _bt_vacuum_one_page() to prevent a page split. + * (We'll have to kill LP_DEAD items here when the page's BTP_HAS_GARBAGE hint + * was not set, but that should be rare.) + * + * The strategy for !checkingunique callers is to perform as much + * deduplication as possible to free as much space as possible now, since + * making it harder to set LP_DEAD bits is considered an acceptable price for + * not having to deduplicate the same page many times. It is unlikely that + * the items on the page will have their LP_DEAD bit set in the future, since + * that hasn't happened before now (besides, entire posting lists can still + * have their LP_DEAD bit set). + * + * The strategy for checkingunique callers is rather different, since the + * overall goal is different. Deduplication cooperates with and enhances + * garbage collection, especially the LP_DEAD bit setting that takes place in + * _bt_check_unique(). Deduplication does as little as possible while still + * preventing a page split for caller, since it's less likely that posting + * lists will have their LP_DEAD bit set. Deduplication avoids creating new + * posting lists with only two heap TIDs, and also avoids creating new posting + * lists from an existing posting list. Deduplication is only useful when it + * delays a page split long enough for garbage collection to prevent the page + * split altogether. checkingunique deduplication can make all the difference + * in cases where VACUUM keeps up with dead index tuples, but "recently dead" + * index tuples are still numerous enough to cause page splits that are truly + * unnecessary. + * + * Note: If newitem contains NULL values in key attributes, caller will be + * !checkingunique even when rel is a unique index. The page in question will + * usually have many existing items with NULLs. 
+ */
+void
+_bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel,
+				   IndexTuple newitem, Size newitemsz, bool checkingunique)
+{
+	OffsetNumber offnum,
+				minoff,
+				maxoff;
+	Page		page = BufferGetPage(buffer);
+	BTPageOpaque oopaque;
+	BTDedupState state = NULL;
+	int			natts = IndexRelationGetNumberOfAttributes(rel);
+	OffsetNumber deletable[MaxIndexTuplesPerPage];
+	bool		minimal = checkingunique;
+	int			ndeletable = 0;
+	Size		pagesaving = 0;
+	int			count = 0;
+	bool		singlevalue = false;
+
+	oopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	/* init deduplication state needed to build posting tuples */
+	state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+	state->rel = rel;
+
+	state->maxitemsize = BTMaxItemSize(page);
+	state->newitem = newitem;
+	state->checkingunique = checkingunique;
+	state->skippedbase = InvalidOffsetNumber;
+	/* Metadata about current pending posting list */
+	state->htids = NULL;
+	state->nhtids = 0;
+	state->nitems = 0;
+	state->alltupsize = 0;
+	state->overlap = false;
+	/* Metadata about base tuple of current pending posting list */
+	state->base = NULL;
+	state->baseoff = InvalidOffsetNumber;
+	state->basetupsize = 0;
+
+	minoff = P_FIRSTDATAKEY(oopaque);
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	/*
+	 * Delete dead tuples, if any.  We cannot simply skip them in the loop
+	 * below, because it's necessary to generate a special XLog record
+	 * containing such tuples to compute latestRemovedXid on a standby server
+	 * later.
+	 *
+	 * This should not affect performance, since it can only happen in the
+	 * rare case where the BTP_HAS_GARBAGE flag was not set and
+	 * _bt_vacuum_one_page was not called, or _bt_vacuum_one_page didn't
+	 * remove all dead items.
+	 */
+	for (offnum = minoff;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemid = PageGetItemId(page, offnum);
+
+		if (ItemIdIsDead(itemid))
+			deletable[ndeletable++] = offnum;
+	}
+
+	if (ndeletable > 0)
+	{
+		/*
+		 * Skip deduplication in the rare case where removing the LP_DEAD
+		 * items encountered here frees sufficient space for caller to avoid
+		 * a page split
+		 */
+		_bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+		if (PageGetFreeSpace(page) >= newitemsz)
+		{
+			pfree(state);
+			return;
+		}
+
+		/* Continue with deduplication */
+		minoff = P_FIRSTDATAKEY(oopaque);
+		maxoff = PageGetMaxOffsetNumber(page);
+	}
+
+	/* Make sure that the page won't have the garbage flag set */
+	oopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
+	newitemsz += sizeof(ItemIdData);
+	/* Conservatively size array */
+	state->htids = palloc(state->maxitemsize);
+
+	/*
+	 * Determine if a "single value" strategy page split is likely to occur
+	 * shortly after deduplication finishes.  It should be possible for the
+	 * single value split to find a split point that packs the left half of
+	 * the split BTREE_SINGLEVAL_FILLFACTOR% full.
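+	 * (With the default BTREE_SINGLEVAL_FILLFACTOR of 96, that means
+	 * leaving only about 4% of the left half free.)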
+ */
+	if (!checkingunique)
+	{
+		ItemId		itemid;
+		IndexTuple	itup;
+
+		itemid = PageGetItemId(page, minoff);
+		itup = (IndexTuple) PageGetItem(page, itemid);
+
+		if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+		{
+			itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
+			itup = (IndexTuple) PageGetItem(page, itemid);
+
+			/*
+			 * Use a different strategy when a future page split seems likely
+			 * to need the "single value" strategy
+			 */
+			if (_bt_keep_natts_fast(rel, newitem, itup) > natts)
+				singlevalue = true;
+		}
+	}
+
+	/*
+	 * Iterate over tuples on the page, trying to deduplicate them into
+	 * posting lists and updating the page in place.  NOTE: It's essential to
+	 * reassess the max offset on each iteration, since it will change as
+	 * items are deduplicated.
+	 */
+	offnum = minoff;
+retry:
+	while (offnum <= PageGetMaxOffsetNumber(page))
+	{
+		ItemId		itemid = PageGetItemId(page, offnum);
+		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+		Assert(!ItemIdIsDead(itemid));
+
+		if (state->nitems == 0)
+		{
+			/*
+			 * No previous/base tuple for the data item -- use the data item
+			 * as base tuple of pending posting list
+			 */
+			_bt_dedup_start_pending(state, itup, offnum);
+		}
+		else if (_bt_keep_natts_fast(rel, state->base, itup) > natts &&
+				 _bt_dedup_save_htid(state, itup))
+		{
+			/*
+			 * Tuple is equal to base tuple of pending posting list.  Heap
+			 * TID(s) for itup have been saved in state.  The next iteration
+			 * will also end up here if it's possible to merge the next tuple
+			 * into the same pending posting list.
+			 */
+		}
+		else
+		{
+			/*
+			 * Tuple is not equal to pending posting list tuple, or
+			 * _bt_dedup_save_htid() opted to not merge current item into
+			 * pending posting list for some other reason (e.g., adding more
+			 * TIDs would have caused posting list to exceed BTMaxItemSize()
+			 * limit).
+			 *
+			 * If state contains pending posting list with more than one item,
+			 * form new posting tuple, and update the page.  Otherwise, reset
+			 * the state and move on.
+			 */
+			pagesaving += _bt_dedup_finish_pending(buffer, state,
+												   RelationNeedsWAL(rel));
+
+			count++;
+
+			/*
+			 * When caller is checkingunique and we have deduplicated enough
+			 * to avoid a page split, do minimal deduplication in case the
+			 * remaining items are about to be marked dead within
+			 * _bt_check_unique().
+			 */
+			if (minimal && pagesaving >= newitemsz)
+				break;
+
+			/*
+			 * Consider special steps when a future page split of the leaf
+			 * page is likely to occur using nbtsplitloc.c's "single value"
+			 * strategy
+			 */
+			if (singlevalue)
+			{
+				/*
+				 * Adjust maxitemsize so that there isn't a third and final
+				 * 1/3 of a page width tuple that fills the page to capacity.
+				 * The third tuple produced should be smaller than the first
+				 * two by an amount equal to the free space that nbtsplitloc.c
+				 * is likely to want to leave behind when the page is split.
+				 * When there are 3 posting lists on the page, we end
+				 * deduplication.  Remaining tuples on the page can be
+				 * deduplicated later, when they're on the new right sibling
+				 * of this page, and the new sibling page needs to be split in
+				 * turn.
+				 *
+				 * Note that it doesn't matter if there are items on the page
+				 * that were already 1/3 of a page during the current pass;
+				 * they'll still count as the first two posting list tuples.
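+				 *
+				 * As an illustration (assuming the default 8KB block size):
+				 * leftfree starts out at 8192 - 24 - 16 = 8152 bytes below,
+				 * and after subtracting the predicted high key size,
+				 * maxitemsize is reduced by about 4% of what remains
+				 * (roughly 320 bytes with BTREE_SINGLEVAL_FILLFACTOR at 96).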
+ */
+				if (count == 2)
+				{
+					Size		leftfree;
+
+					/* This calculation needs to match nbtsplitloc.c */
+					leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
+						MAXALIGN(sizeof(BTPageOpaqueData));
+					/* Subtract predicted size of new high key */
+					leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
+
+					/*
+					 * Reduce maxitemsize by an amount equal to target free
+					 * space on left half of page
+					 */
+					state->maxitemsize -= leftfree *
+						((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
+				}
+				else if (count == 3)
+					break;
+			}
+
+			/*
+			 * Next iteration starts immediately after base tuple offset (this
+			 * will be the next offset on the page when we didn't modify the
+			 * page)
+			 */
+			offnum = state->baseoff;
+		}
+
+		offnum = OffsetNumberNext(offnum);
+	}
+
+	/* Handle the last item when pending posting list is not empty */
+	if (state->nitems != 0)
+	{
+		pagesaving += _bt_dedup_finish_pending(buffer, state,
+											   RelationNeedsWAL(rel));
+		count++;
+	}
+
+	if (pagesaving < newitemsz && state->skippedbase != InvalidOffsetNumber)
+	{
+		/*
+		 * Didn't free enough space for new item in first checkingunique
+		 * pass.  Try making a second pass over the page, this time starting
+		 * from the first candidate posting list base offset that was skipped
+		 * over in the first pass (only do a second pass when this actually
+		 * happened).
+		 *
+		 * The second pass over the page may deduplicate items that were
+		 * initially passed over due to concerns about limiting the
+		 * effectiveness of LP_DEAD bit setting within _bt_check_unique().
+		 * Note that the second pass will still stop deduplicating as soon as
+		 * enough space has been freed to avoid an immediate page split.
+		 */
+		Assert(state->checkingunique);
+		offnum = state->skippedbase;
+
+		state->checkingunique = false;
+		state->skippedbase = InvalidOffsetNumber;
+		state->alltupsize = 0;
+		state->nitems = 0;
+		state->base = NULL;
+		state->baseoff = InvalidOffsetNumber;
+		state->basetupsize = 0;
+		goto retry;
+	}
+
+	/* Local space accounting should agree with page accounting */
+	Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
+
+	/* be tidy */
+	pfree(state->htids);
+	pfree(state);
+}
+
+/*
+ * Create a new pending posting list tuple based on caller's tuple.
+ *
+ * Every tuple processed by the deduplication routines either becomes the base
+ * tuple for a posting list, or gets its heap TID(s) accepted into a pending
+ * posting list.  A tuple that starts out as the base tuple for a posting list
+ * will only actually be rewritten within _bt_dedup_finish_pending() when
+ * there was at least one successful call to _bt_dedup_save_htid().
+ */
+void
+_bt_dedup_start_pending(BTDedupState state, IndexTuple base,
+						OffsetNumber baseoff)
+{
+	Assert(state->nhtids == 0);
+	Assert(state->nitems == 0);
+
+	/*
+	 * Copy heap TIDs from new base tuple for new candidate posting list into
+	 * the htids array.  Assume that we'll eventually create a new posting
+	 * tuple by merging later tuples with this existing one, though we may
+	 * not.
+ */ + if (!BTreeTupleIsPosting(base)) + { + memcpy(state->htids, base, sizeof(ItemPointerData)); + state->nhtids = 1; + /* Save size of tuple without any posting list */ + state->basetupsize = IndexTupleSize(base); + } + else + { + int nposting; + + nposting = BTreeTupleGetNPosting(base); + memcpy(state->htids, BTreeTupleGetPosting(base), + sizeof(ItemPointerData) * nposting); + state->nhtids = nposting; + /* Save size of tuple without any posting list */ + state->basetupsize = BTreeTupleGetPostingOffset(base); + } + + /* + * Save new base tuple itself -- it'll be needed if we actually create a + * new posting list from new pending posting list. + * + * Must maintain size of all tuples (including line pointer overhead) to + * calculate space savings on page within _bt_dedup_finish_pending(). + * Also, save number of base tuple logical tuples so that we can save + * cycles in the common case where an existing posting list can't or won't + * be merged with other tuples on the page. + */ + state->nitems = 1; + state->base = base; + state->baseoff = baseoff; + state->alltupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData); + /* Also save baseoff in pending state for interval */ + state->interval.baseoff = state->baseoff; + state->overlap = false; + if (state->newitem) + { + /* Might overlap with new item -- mark it as possible if it is */ + if (BTreeTupleGetHeapTID(base) < BTreeTupleGetHeapTID(state->newitem)) + state->overlap = true; + } +} + +/* + * Save itup heap TID(s) into pending posting list where possible. + * + * Returns bool indicating if the pending posting list managed by state has + * itup's heap TID(s) saved. When this is false, enlarging the pending + * posting list by the required amount would exceed the maxitemsize limit, so + * caller must finish the pending posting list tuple. (Generally itup becomes + * the base tuple of caller's new pending posting list). + */ +bool +_bt_dedup_save_htid(BTDedupState state, IndexTuple itup) +{ + int nhtids; + ItemPointer htids; + Size mergedtupsz; + + if (!BTreeTupleIsPosting(itup)) + { + nhtids = 1; + htids = &itup->t_tid; + } + else + { + nhtids = BTreeTupleGetNPosting(itup); + htids = BTreeTupleGetPosting(itup); + } + + /* + * Don't append (have caller finish pending posting list as-is) if + * appending heap TID(s) from itup would put us over limit + */ + mergedtupsz = MAXALIGN(state->basetupsize + + (state->nhtids + nhtids) * + sizeof(ItemPointerData)); + + if (mergedtupsz > state->maxitemsize) + return false; + + /* Don't merge existing posting lists with checkingunique */ + if (state->checkingunique && + (BTreeTupleIsPosting(state->base) || nhtids > 1)) + { + /* May begin here if second pass over page is required */ + if (state->skippedbase == InvalidOffsetNumber) + state->skippedbase = state->baseoff; + return false; + } + + if (state->overlap) + { + if (BTreeTupleGetMaxHeapTID(itup) > BTreeTupleGetHeapTID(state->newitem)) + { + /* + * newitem has heap TID in the range of the would-be new posting + * list. Avoid an immediate posting list split for caller. 
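+			 * Finishing the pending posting list now (by returning false)
+			 * keeps newitem's heap TID outside the new posting list's TID
+			 * range, so caller can insert newitem as a plain tuple without
+			 * first splitting a posting list.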
+ */
+			if (_bt_keep_natts_fast(state->rel, state->newitem, itup) >
+				IndexRelationGetNumberOfAttributes(state->rel))
+			{
+				state->newitem = NULL;	/* avoid unnecessary comparisons */
+				return false;
+			}
+		}
+	}
+
+	/*
+	 * Save heap TIDs to pending posting list tuple -- itup can be merged into
+	 * pending posting list
+	 */
+	state->nitems++;
+	memcpy(state->htids + state->nhtids, htids,
+		   sizeof(ItemPointerData) * nhtids);
+	state->nhtids += nhtids;
+	state->alltupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
+
+	return true;
+}
+
+/*
+ * Finalize pending posting list tuple, and add it to the page.  Final tuple
+ * is based on saved base tuple, and saved list of heap TIDs.
+ *
+ * Returns space saving from deduplicating to make a new posting list tuple.
+ * Note that this includes line pointer overhead.  This is zero in the case
+ * where no deduplication was possible.
+ */
+Size
+_bt_dedup_finish_pending(Buffer buffer, BTDedupState state, bool need_wal)
+{
+	Size		spacesaving = 0;
+	Page		page = BufferGetPage(buffer);
+	int			minimum = 2;
+
+	Assert(state->nitems > 0);
+	Assert(state->nitems <= state->nhtids);
+	Assert(state->interval.baseoff == state->baseoff);
+
+	/*
+	 * Only create a posting list when at least 3 heap TIDs will appear in the
+	 * checkingunique case (checkingunique strategy won't merge existing
+	 * posting list tuples, so we know that the number of items here must also
+	 * be the total number of heap TIDs).  Creating a new posting list with
+	 * only two heap TIDs won't even save enough space to fit another
+	 * duplicate with the same key as the posting list.  This is a bad
+	 * trade-off if there is a chance that the LP_DEAD bit can be set for
+	 * either existing tuple by putting off deduplication.
+	 *
+	 * (Note that a second pass over the page can deduplicate the item if that
+	 * is truly the only way to avoid a page split for the checkingunique
+	 * caller)
+	 */
+	Assert(!state->checkingunique || state->nitems == 1 ||
+		   state->nhtids == state->nitems);
+	if (state->checkingunique)
+	{
+		minimum = 3;
+		/* May begin here if second pass over page is required */
+		if (state->nitems == 2 && state->skippedbase == InvalidOffsetNumber)
+			state->skippedbase = state->baseoff;
+	}
+
+	if (state->nitems >= minimum)
+	{
+		IndexTuple	final;
+		Size		finalsz;
+		OffsetNumber offnum;
+		OffsetNumber deletable[MaxOffsetNumber];
+		int			ndeletable = 0;
+
+		/* find all tuples that will be replaced with this new posting tuple */
+		for (offnum = state->baseoff;
+			 offnum < state->baseoff + state->nitems;
+			 offnum = OffsetNumberNext(offnum))
+			deletable[ndeletable++] = offnum;
+
+		/* Form a tuple with a posting list */
+		final = _bt_form_posting(state->base, state->htids, state->nhtids);
+		finalsz = IndexTupleSize(final);
+		spacesaving = state->alltupsize - (finalsz + sizeof(ItemIdData));
+		/* Must have saved some space */
+		Assert(spacesaving > 0 && spacesaving < BLCKSZ);
+
+		/* Save final number of items for posting list */
+		state->interval.nitems = state->nitems;
+
+		Assert(finalsz <= state->maxitemsize);
+		Assert(finalsz == MAXALIGN(IndexTupleSize(final)));
+
+		START_CRIT_SECTION();
+
+		/* Delete items to replace */
+		PageIndexMultiDelete(page, deletable, ndeletable);
+		/* Insert posting tuple */
+		if (PageAddItem(page, (Item) final, finalsz, state->baseoff, false,
+						false) == InvalidOffsetNumber)
+			elog(ERROR, "deduplication failed to add tuple to page");
+
+		MarkBufferDirty(buffer);
+
+		/* Log deduplicated items */
+		if (need_wal)
+		{
+			XLogRecPtr	recptr;
+			xl_btree_dedup xlrec_dedup;
+
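+			/*
+			 * Only the interval (baseoff and nitems) is logged; the REDO
+			 * routine re-reads the affected items from the page and rebuilds
+			 * the posting list tuple itself.
+			 */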
xlrec_dedup.baseoff = state->interval.baseoff; + xlrec_dedup.nitems = state->interval.nitems; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP_PAGE); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + pfree(final); + } + + /* Reset state for next pending posting list */ + state->nhtids = 0; + state->nitems = 0; + state->alltupsize = 0; + + return spacesaving; +} + +/* + * Build a posting list tuple from a "base" index tuple and a list of heap + * TIDs for posting list. + * + * Caller's "htids" array must be sorted in ascending order. Any heap TIDs + * from caller's base tuple will not appear in returned posting list. + * + * If nhtids == 1, builds a non-posting tuple (posting list tuples can never + * have a single heap TID). + */ +IndexTuple +_bt_form_posting(IndexTuple tuple, ItemPointer htids, int nhtids) +{ + uint32 keysize, + newsize = 0; + IndexTuple itup; + + /* We only need key part of the tuple */ + if (BTreeTupleIsPosting(tuple)) + keysize = BTreeTupleGetPostingOffset(tuple); + else + keysize = IndexTupleSize(tuple); + + Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX); + + /* Add space needed for posting list */ + if (nhtids > 1) + newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nhtids; + else + newsize = keysize; + + newsize = MAXALIGN(newsize); + itup = palloc0(newsize); + memcpy(itup, tuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nhtids > 1) + { + /* Form posting tuple, fill posting fields */ + + itup->t_info |= INDEX_ALT_TID_MASK; + BTreeSetPostingMeta(itup, nhtids, SHORTALIGN(keysize)); + /* Copy posting list into the posting tuple */ + memcpy(BTreeTupleGetPosting(itup), htids, + sizeof(ItemPointerData) * nhtids); + +#ifdef USE_ASSERT_CHECKING + { + /* Assert that htid array is sorted and has unique TIDs */ + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + current = BTreeTupleGetPostingN(itup, i); + Assert(ItemPointerCompare(current, &last) > 0); + ItemPointerCopy(current, &last); + } + } +#endif + } + else + { + /* To finish building of a non-posting tuple, copy TID from htids */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(htids, &itup->t_tid); + } + + return itup; +} + +/* + * Prepare for a posting list split by swapping heap TID in newitem with heap + * TID from original posting list (the 'oposting' heap TID located at offset + * 'postingoff'). + * + * Returns new posting list tuple, which is palloc()'d in caller's context. + * This is guaranteed to be the same size as 'oposting'. Modified version of + * newitem is what caller actually inserts inside the critical section that + * also performs an in-place update of posting list. + * + * Explicit WAL-logging of newitem must use the original version of newitem in + * order to make it possible for our nbtxlog.c callers to correctly REDO + * original steps. This approach avoids any explicit WAL-logging of a posting + * list tuple. This is important because posting lists are often much larger + * than plain tuples. + * + * Caller should avoid assuming that the IndexTuple-wise key representation in + * newitem is bitwise equal to the representation used within oposting. Note, + * in particular, that one may even be larger than the other. 
This could + * occur due to differences in TOAST input state, for example. + */ +IndexTuple +_bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff) +{ + int nhtids; + char *replacepos; + char *rightpos; + Size nbytes; + IndexTuple nposting; + + nhtids = BTreeTupleGetNPosting(oposting); + Assert(postingoff > 0 && postingoff < nhtids); + + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff); + rightpos = replacepos + sizeof(ItemPointerData); + nbytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID (shift TIDs one place to the right, losing original rightmost + * TID) + */ + memmove(rightpos, replacepos, nbytes); + + /* Fill the gap with the TID of the new item */ + ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos); + + /* Copy original posting list's rightmost TID into new item */ + ItemPointerCopy(BTreeTupleGetPostingN(oposting, nhtids - 1), + &newitem->t_tid); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting), + BTreeTupleGetHeapTID(newitem)) < 0); + Assert(BTreeTupleGetNPosting(oposting) == BTreeTupleGetNPosting(nposting)); + + return nposting; +} diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index b93b2a0ffd..d816c45f2c 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -28,6 +28,8 @@ /* Minimum tree height for application of fastpath optimization */ #define BTREE_FASTPATH_MIN_LEVEL 2 +/* GUC parameter */ +int btree_deduplication = DEDUP_NONUNIQUE; static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); @@ -47,10 +49,12 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page); static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem); + IndexTuple newitem, IndexTuple orignewitem, + IndexTuple nposting, uint16 postingoff); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, @@ -61,7 +65,8 @@ static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. * * This routine is called by the public interface routine, btinsert. - * By here, itup is filled in, including the TID. + * By here, itup is filled in, including the TID. Caller should be + * prepared for us to scribble on 'itup'. * * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this * will allow duplicates. 
Otherwise (UNIQUE_CHECK_YES or @@ -125,6 +130,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, insertstate.itup_key = itup_key; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; + insertstate.postingoff = 0; /* * It's very common to have an index on an auto-incremented or @@ -300,7 +306,7 @@ top: newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, false); + itup, newitemoff, insertstate.postingoff, false); } else { @@ -353,6 +359,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, BTPageOpaque opaque; Buffer nbuf = InvalidBuffer; bool found = false; + bool inposting = false; + bool prev_all_dead = true; + int curposti = 0; /* Assume unique until we find a duplicate */ *is_unique = true; @@ -374,6 +383,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* * Scan over all equal tuples, looking for live conflicts. + * + * Note that each iteration of the loop processes one heap TID, not one + * index tuple. The page offset number won't be advanced for iterations + * which process heap TIDs from posting list tuples until the last such + * heap TID for the posting list (curposti will be advanced instead). */ Assert(!insertstate->bounds_valid || insertstate->low == offset); Assert(!itup_key->anynullkeys); @@ -435,7 +449,27 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htid = curitup->t_tid; + + /* + * decide if this is the first heap TID in tuple we'll + * process, or if we should continue to process current + * posting list + */ + if (!BTreeTupleIsPosting(curitup)) + { + htid = curitup->t_tid; + inposting = false; + } + else if (!inposting) + { + /* First heap TID in posting list */ + inposting = true; + prev_all_dead = true; + curposti = 0; + } + + if (inposting) + htid = *BTreeTupleGetPostingN(curitup, curposti); /* * If we are doing a recheck, we expect to find the tuple we @@ -511,8 +545,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * not part of this chain because it had a different index * entry. */ - htid = itup->t_tid; - if (table_index_fetch_tuple_check(heapRel, &htid, + if (table_index_fetch_tuple_check(heapRel, &itup->t_tid, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ @@ -570,12 +603,14 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, RelationGetRelationName(rel)))); } } - else if (all_dead) + else if (all_dead && (!inposting || + (prev_all_dead && + curposti == BTreeTupleGetNPosting(curitup) - 1))) { /* - * The conflicting tuple (or whole HOT chain) is dead to - * everyone, so we may as well mark the index entry - * killed. + * The conflicting tuple (or all HOT chains pointed to by + * all posting list TIDs) is dead to everyone, so mark the + * index entry killed. */ ItemIdMarkDead(curitemid); opaque->btpo_flags |= BTP_HAS_GARBAGE; @@ -589,14 +624,29 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else MarkBufferDirtyHint(insertstate->buf, true); } + + /* + * Remember if posting list tuple has even a single HOT chain + * whose members are not all dead + */ + if (!all_dead && inposting) + prev_all_dead = false; } } - /* - * Advance to next tuple to continue checking. 
- */ - if (offset < maxoff) + if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) + { + /* Advance to next TID in same posting list */ + curposti++; + continue; + } + else if (offset < maxoff) + { + /* Advance to next tuple */ + curposti = 0; + inposting = false; offset = OffsetNumberNext(offset); + } else { int highkeycmp; @@ -621,6 +671,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); } + curposti = 0; + inposting = false; maxoff = PageGetMaxOffsetNumber(page); offset = P_FIRSTDATAKEY(opaque); /* Don't invalidate binary search bounds */ @@ -689,6 +741,7 @@ _bt_findinsertloc(Relation rel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque lpageop; + OffsetNumber location; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -751,13 +804,25 @@ _bt_findinsertloc(Relation rel, /* * If the target page is full, see if we can obtain enough space by - * erasing LP_DEAD items + * erasing LP_DEAD items. If that doesn't work out, and if the index + * deduplication is both possible and enabled, try deduplication. */ - if (PageGetFreeSpace(page) < insertstate->itemsz && - P_HAS_GARBAGE(lpageop)) + if (PageGetFreeSpace(page) < insertstate->itemsz) { - _bt_vacuum_one_page(rel, insertstate->buf, heapRel); - insertstate->bounds_valid = false; + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; + } + + if (insertstate->itup_key->safededup && BTGetUseDedup(rel) && + PageGetFreeSpace(page) < insertstate->itemsz) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itup, insertstate->itemsz, + checkingunique); + insertstate->bounds_valid = false; + } } } else @@ -839,7 +904,38 @@ _bt_findinsertloc(Relation rel, Assert(P_RIGHTMOST(lpageop) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); - return _bt_binsrch_insert(rel, insertstate); + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Insertion is not prepared for the case where an LP_DEAD posting list + * tuple must be split. In the unlikely event that this happens, call + * _bt_dedup_one_page() to force it to kill all LP_DEAD items. + */ + if (unlikely(insertstate->postingoff == -1)) + { + Assert(insertstate->itup_key->safededup); + + /* + * Don't check if the option is enabled, since no actual deduplication + * will be done, just cleanup. + */ + _bt_dedup_one_page(rel, insertstate->buf, heapRel, insertstate->itup, + 0, checkingunique); + Assert(!P_HAS_GARBAGE(lpageop)); + + /* Must reset insertstate ahead of new _bt_binsrch_insert() call */ + insertstate->bounds_valid = false; + insertstate->postingoff = 0; + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Might still have to split some other posting list now, but that + * should never be LP_DEAD + */ + Assert(insertstate->postingoff >= 0); + } + + return location; } /* @@ -905,10 +1001,12 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * This recursive procedure does the following things: * + * + if necessary, splits an existing posting list on page. + * This is only needed when 'postingoff' is non-zero. * + if necessary, splits the target page, using 'itup_key' for * suffix truncation on leaf pages (caller passes NULL for * non-leaf pages). - * + inserts the tuple. + * + inserts the new tuple (could be from split posting list). 
* + if the page was split, pops the parent stack, and finds the * right place to insert the new child pointer (by walking * right using information stored in the parent stack). @@ -918,7 +1016,8 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, - * we will have dropped both the pin and the lock on the buffer. + * we will have dropped both the pin and the lock on the buffer. Caller + * should be prepared for us to scribble on 'itup'. * * This routine only performs retail tuple insertions. 'itup' should * always be either a non-highkey leaf item, or a downlink (new high @@ -936,11 +1035,15 @@ _bt_insertonpg(Relation rel, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int postingoff, bool split_only_page) { Page page; BTPageOpaque lpageop; Size itemsz; + IndexTuple oposting; + IndexTuple origitup = NULL; + IndexTuple nposting = NULL; page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -954,6 +1057,8 @@ _bt_insertonpg(Relation rel, Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); + /* retail insertions of posting list tuples are disallowed */ + Assert(!BTreeTupleIsPosting(itup)); /* The caller should've finished any incomplete splits already. */ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -964,6 +1069,39 @@ _bt_insertonpg(Relation rel, itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ + /* + * Do we need to split an existing posting list item? + */ + if (postingoff != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple on a leaf page. Prepare to + * split an existing posting list by swapping new item's heap TID with + * the rightmost heap TID from original posting list, and generating a + * new version of the posting list that has new item's heap TID. + * + * Posting list splits work by modifying the overlapping posting list + * as part of the same atomic operation that inserts the "new item". + * The space accounting is kept simple, since it does not need to + * consider posting list splits at all (this is particularly important + * for the case where we also have to split the page). Overwriting + * the posting list with its post-split version is treated as an extra + * step in either the insert or page split critical section. + */ + Assert(P_ISLEAF(lpageop) && !ItemIdIsDead(itemid)); + oposting = (IndexTuple) PageGetItem(page, itemid); + + /* save a copy of itup with unchanged TID for xlog record */ + origitup = CopyIndexTuple(itup); + nposting = _bt_swap_posting(itup, oposting, postingoff); + + /* Alter offset so that it goes after existing posting list */ + newitemoff = OffsetNumberNext(newitemoff); + } + /* * Do we need to split the page to fit the item on it? 
* @@ -996,7 +1134,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup); + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + origitup, nposting, postingoff); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1075,6 +1214,13 @@ _bt_insertonpg(Relation rel, elog(PANIC, "failed to add new item to block %u in index \"%s\"", itup_blkno, RelationGetRelationName(rel)); + /* + * Posting list split requires an in-place update of the existing + * posting list + */ + if (nposting) + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + MarkBufferDirty(buf); if (BufferIsValid(metabuf)) @@ -1120,8 +1266,19 @@ _bt_insertonpg(Relation rel, XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); - if (P_ISLEAF(lpageop)) + if (P_ISLEAF(lpageop) && postingoff == 0) + { + /* Simple leaf insert */ xlinfo = XLOG_BTREE_INSERT_LEAF; + } + else if (postingoff != 0) + { + /* + * Leaf insert with posting list split. Must include + * postingoff field before newitem/orignewitem. + */ + xlinfo = XLOG_BTREE_INSERT_POST; + } else { /* @@ -1144,6 +1301,7 @@ _bt_insertonpg(Relation rel, xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_safededup = metad->btm_safededup; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -1152,7 +1310,28 @@ _bt_insertonpg(Relation rel, } XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + + /* + * We always write newitem to the page, but when there is an + * original newitem due to a posting list split then we log the + * original item instead. REDO routine must reconstruct the final + * newitem at the same time it reconstructs nposting. + */ + if (postingoff == 0) + XLogRegisterBufData(0, (char *) itup, + IndexTupleSize(itup)); + else + { + /* + * Must explicitly log posting off before newitem in case of + * posting list split. + */ + uint16 upostingoff = postingoff; + + XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16)); + XLogRegisterBufData(0, (char *) origitup, + IndexTupleSize(origitup)); + } recptr = XLogInsert(RM_BTREE_ID, xlinfo); @@ -1194,6 +1373,13 @@ _bt_insertonpg(Relation rel, _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, cachedBlock); } + + /* be tidy */ + if (postingoff != 0) + { + pfree(nposting); + pfree(origitup); + } } /* @@ -1209,12 +1395,25 @@ _bt_insertonpg(Relation rel, * This function will clear the INCOMPLETE_SPLIT flag on it, and * release the buffer. * + * orignewitem, nposting, and postingoff are needed when an insert of + * orignewitem results in both a posting list split and a page split. + * newitem and nposting are replacements for orignewitem and the + * existing posting list on the page respectively. These extra + * posting list split details are used here in the same way as they + * are used in the more common case where a posting list split does + * not coincide with a page split. 
We need to deal with posting list + * splits directly in order to ensure that everything that follows + * from the insert of orignewitem is handled as a single atomic + * operation (though caller's insert of a new pivot/downlink into + * parent page will still be a separate operation). + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. */ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem) + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) { Buffer rbuf; Page origpage; @@ -1236,12 +1435,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber firstright; OffsetNumber maxoff; OffsetNumber i; + OffsetNumber replacepostingoff = InvalidOffsetNumber; bool newitemonleft, isleaf; IndexTuple lefthikey; int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + /* + * Determine offset number of existing posting list on page when a split + * of a posting list needs to take place as the page is split + */ + if (nposting != NULL) + { + Assert(itup_key->heapkeyspace); + replacepostingoff = OffsetNumberPrev(newitemoff); + } + /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back @@ -1273,6 +1483,13 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * newitemoff == firstright. In all other cases it's clear which side of * the split every tuple goes on from context. newitemonleft is usually * (but not always) redundant information. + * + * Note: In theory, the split point choice logic should operate against a + * version of the page that already replaced the posting list at offset + * replacepostingoff with nposting where applicable. We don't bother with + * that, though. Both versions of the posting list must be the same size, + * and both will have the same base tuple key values, so split point + * choice is never affected. */ firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); @@ -1340,6 +1557,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, firstright); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (firstright == replacepostingoff) + item = nposting; } /* @@ -1373,6 +1593,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (lastleftoff == replacepostingoff) + lastleft = nposting; } Assert(lastleft != item); @@ -1480,8 +1703,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* + * did caller pass new replacement posting list tuple due to posting + * list split? 
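+         * (This is checked ahead of the newitemoff case below: the
+         * swapped-in posting list occupies the offset just before
+         * newitemoff, so it can never be confused with the new item.)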
+ */ + if (i == replacepostingoff) + { + /* + * swap origpage posting list with post-posting-list-split version + * from caller + */ + Assert(isleaf); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + item = nposting; + } + /* does new item belong before this one? */ - if (i == newitemoff) + else if (i == newitemoff) { if (newitemonleft) { @@ -1650,8 +1888,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, XLogRecPtr recptr; xlrec.level = ropaque->btpo.level; + /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstright = firstright; xlrec.newitemoff = newitemoff; + xlrec.postingoff = 0; + if (replacepostingoff < firstright) + xlrec.postingoff = postingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); @@ -1670,11 +1912,45 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * because it's included with all the other items on the right page.) * Show the new item as belonging to the left page buffer, so that it * is not stored if XLogInsert decides it needs a full-page image of - * the left page. We store the offset anyway, though, to support - * archive compression of these records. + * the left page. We always store newitemoff in the record, though. + * + * The details are sometimes slightly different for page splits that + * coincide with a posting list split. If both the replacement + * posting list and newitem go on the right page, then we don't need + * to log anything extra, just like the simple !newitemonleft + * no-posting-split case (postingoff is set to zero in the WAL record, + * so recovery doesn't need to process a posting list split at all). + * Otherwise, we set postingoff and log orignewitem instead of + * newitem, despite having actually inserted newitem. Recovery must + * reconstruct nposting and newitem using _bt_swap_posting(). + * + * Note: It's possible that our page split point is the point that + * makes the posting list lastleft and newitem firstright. This is + * the only case where we log orignewitem despite newitem going on the + * right page. If XLogInsert decides that it can omit orignewitem due + * to logging a full-page image of the left page, everything still + * works out, since recovery only needs to log orignewitem for items + * on the left page (just like the regular newitem-logged case). 
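+         *
+         * In short: xlrec.postingoff is nonzero (and orignewitem rather
+         * than newitem is logged) exactly when the replacement posting
+         * list ended up on the left half, i.e. replacepostingoff <
+         * firstright; REDO then repeats _bt_swap_posting() to recreate
+         * both nposting and the final newitem.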
*/ - if (newitemonleft) - XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + if (newitemonleft || xlrec.postingoff != 0) + { + if (xlrec.postingoff == 0) + { + /* Must WAL-log newitem, since it's on left page */ + Assert(newitemonleft); + Assert(orignewitem == NULL && nposting == NULL); + XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + } + else + { + /* Must WAL-log orignewitem following posting list split */ + Assert(newitemonleft || firstright == newitemoff); + Assert(ItemPointerCompare(&orignewitem->t_tid, + &newitem->t_tid) < 0); + XLogRegisterBufData(0, (char *) orignewitem, + MAXALIGN(IndexTupleSize(orignewitem))); + } + } /* Log the left page's new high key */ itemid = PageGetItemId(origpage, P_HIKEY); @@ -1834,7 +2110,7 @@ _bt_insert_parent(Relation rel, /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, - new_item, stack->bts_offset + 1, + new_item, stack->bts_offset + 1, 0, is_only); /* be tidy */ @@ -2190,6 +2466,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.fastlevel = metad->btm_level; md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + md.btm_safededup = metad->btm_safededup; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -2303,6 +2580,6 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * Note: if we didn't find any LP_DEAD items, then the page's * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a * separate write to clear it, however. We will clear it when we split - * the page. + * the page (or when deduplication runs). */ } diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 66c79623cf..3b49eb0762 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -42,12 +43,18 @@ static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, BlockNumber *target, BlockNumber *rightsib); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid); +static TransactionId _bt_compute_xid_horizon_for_tuples(Relation rel, + Relation heapRel, + Buffer buf, + OffsetNumber *itemnos, + int nitems); /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ void -_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) +_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, + bool safededup) { BTMetaPageData *metad; BTPageOpaque metaopaque; @@ -63,6 +70,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_fastlevel = level; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + metad->btm_safededup = safededup; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -102,6 +110,9 @@ _bt_upgrademetapage(Page page) metad->btm_version = BTREE_NOVAC_VERSION; metad->btm_oldest_btpo_xact = InvalidTransactionId; metad->btm_last_cleanup_num_heap_tuples = -1.0; + /* Only a REINDEX can set this field */ + Assert(!metad->btm_safededup); + metad->btm_safededup = false; /* Adjust pd_lower (see _bt_initmetapage() for details) */ ((PageHeader) page)->pd_lower = @@ -213,6 +224,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId 
oldestBtpoXact,
 	md.fastlevel = metad->btm_fastlevel;
 	md.oldest_btpo_xact = oldestBtpoXact;
 	md.last_cleanup_num_heap_tuples = numHeapTuples;
+	md.btm_safededup = metad->btm_safededup;
 
 	XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
 
@@ -274,6 +286,8 @@ _bt_getroot(Relation rel, int access)
 		Assert(metad->btm_magic == BTREE_MAGIC);
 		Assert(metad->btm_version >= BTREE_MIN_VERSION);
 		Assert(metad->btm_version <= BTREE_VERSION);
+		Assert(!metad->btm_safededup ||
+			   metad->btm_version > BTREE_NOVAC_VERSION);
 		Assert(metad->btm_root != P_NONE);
 
 		rootblkno = metad->btm_fastroot;
@@ -394,6 +408,7 @@ _bt_getroot(Relation rel, int access)
 			md.fastlevel = 0;
 			md.oldest_btpo_xact = InvalidTransactionId;
 			md.last_cleanup_num_heap_tuples = -1.0;
+			md.btm_safededup = metad->btm_safededup;
 
 			XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
 
@@ -618,6 +633,7 @@ _bt_getrootheight(Relation rel)
 	Assert(metad->btm_magic == BTREE_MAGIC);
 	Assert(metad->btm_version >= BTREE_MIN_VERSION);
 	Assert(metad->btm_version <= BTREE_VERSION);
+	Assert(!metad->btm_safededup || metad->btm_version > BTREE_NOVAC_VERSION);
 	Assert(metad->btm_fastroot != P_NONE);
 
 	return metad->btm_fastlevel;
@@ -683,6 +699,56 @@ _bt_heapkeyspace(Relation rel)
 	return metad->btm_version > BTREE_NOVAC_VERSION;
 }
 
+/*
+ * _bt_safededup() -- can deduplication safely be used by index?
+ *
+ * Uses field from index relation's metapage/cached metapage.
+ */
+bool
+_bt_safededup(Relation rel)
+{
+	BTMetaPageData *metad;
+
+	if (rel->rd_amcache == NULL)
+	{
+		Buffer		metabuf;
+
+		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+		metad = _bt_getmeta(rel, metabuf);
+
+		/*
+		 * If there's no root page yet, _bt_getroot() doesn't expect a cache
+		 * to be made, so just stop here. (XXX perhaps _bt_getroot() should
+		 * be changed to allow this case.)
+		 *
+		 * Note that we rely on the assumption that this field will be zero'ed
+		 * on indexes that were pg_upgrade'd.
+		 */
+		if (metad->btm_root == P_NONE)
+		{
+			_bt_relbuf(rel, metabuf);
+			return metad->btm_safededup;
+		}
+
+		/* Cache the metapage data for next time */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(BTMetaPageData));
+		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
+		_bt_relbuf(rel, metabuf);
+	}
+
+	/* Get cached page */
+	metad = (BTMetaPageData *) rel->rd_amcache;
+	/* We shouldn't have cached it if any of these fail */
+	Assert(metad->btm_magic == BTREE_MAGIC);
+	Assert(metad->btm_version >= BTREE_MIN_VERSION);
+	Assert(metad->btm_version <= BTREE_VERSION);
+	Assert(!metad->btm_safededup || metad->btm_version > BTREE_NOVAC_VERSION);
+	Assert(metad->btm_fastroot != P_NONE);
+
+	return metad->btm_safededup;
+}
+
 /*
  * _bt_checkpage() -- Verify that a freshly-read page looks sane.
  */
@@ -968,27 +1034,73 @@ _bt_page_recyclable(Page page)
  * deleting the page it points to.
  *
  * This routine assumes that the caller has pinned and locked the buffer.
- * Also, the given deletable array *must* be sorted in ascending order.
+ * Also, the given deletable and updateitemnos arrays *must* be sorted in
+ * ascending order.
 *
 * We record VACUUMs and b-tree deletes differently in WAL. Deletes must
 * generate recovery conflicts by accessing the heap inline, whereas VACUUMs
 * can rely on the initial heap scan taking care of the problem (pruning would
- * have generated the conflicts needed for hot standby already).
Also, + * VACUUMs must deal with the case where posting list tuples have some dead + * TIDs, and some remaining TIDs that must not be killed. */ void -_bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, - int ndeletable) +_bt_delitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *deletable, int ndeletable, + OffsetNumber *updateitemnos, + IndexTuple *updated, int nupdatable) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Size itemsz; + Size updated_sz = 0; + char *updated_buf = NULL; - Assert(ndeletable > 0); + Assert(ndeletable > 0 || nupdatable > 0); + + /* XLOG stuff, buffer for updated */ + if (nupdatable > 0 && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nupdatable; i++) + updated_sz += MAXALIGN(IndexTupleSize(updated[i])); + + updated_buf = palloc(updated_sz); + for (int i = 0; i < nupdatable; i++) + { + itemsz = IndexTupleSize(updated[i]); + memcpy(updated_buf + offset, (char *) updated[i], itemsz); + offset += MAXALIGN(itemsz); + } + Assert(offset == updated_sz); + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + /* Handle posting tuple updates */ + for (int i = 0; i < nupdatable; i++) + { + /* + * Delete the old posting tuple first. This will also clear the + * LP_DEAD bit. (It would be correct to leave it set, but we're going + * to unset the BTP_HAS_GARBAGE bit anyway.) + */ + PageIndexTupleDelete(page, updateitemnos[i]); + + itemsz = IndexTupleSize(updated[i]); + itemsz = MAXALIGN(itemsz); + + /* Add tuple with updated ItemPointers to the page */ + if (PageAddItem(page, (Item) updated[i], itemsz, updateitemnos[i], + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to rewrite posting list item in index while doing vacuum"); + } + /* Fix the page */ - PageIndexMultiDelete(page, deletable, ndeletable); + if (ndeletable > 0) + PageIndexMultiDelete(page, deletable, ndeletable); /* * We can clear the vacuum cycle ID since this page has certainly been @@ -1015,6 +1127,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.ndeleted = ndeletable; + xlrec_vacuum.nupdated = nupdatable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1025,8 +1138,22 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, * is. When XLogInsert stores the whole buffer, the offsets array * need not be stored too. */ - XLogRegisterBufData(0, (char *) deletable, ndeletable * - sizeof(OffsetNumber)); + if (ndeletable > 0) + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + /* + * Here we should save offnums and updated tuples themselves. It's + * important to restore them in correct order. At first, we must + * handle updated tuples and only after that other deleted items. + */ + if (nupdatable > 0) + { + Assert(updated_buf != NULL); + XLogRegisterBufData(0, (char *) updateitemnos, + nupdatable * sizeof(OffsetNumber)); + XLogRegisterBufData(0, updated_buf, updated_sz); + } recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); @@ -1036,6 +1163,91 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, END_CRIT_SECTION(); } +/* + * Get the latestRemovedXid from the table entries pointed at by the index + * tuples being deleted. + * + * This is a version of index_compute_xid_horizon_for_tuples() specialized to + * nbtree, which can handle posting lists. 
+ */
+static TransactionId
+_bt_compute_xid_horizon_for_tuples(Relation rel, Relation heapRel,
+								   Buffer buf, OffsetNumber *itemnos,
+								   int nitems)
+{
+	ItemPointer htids;
+	TransactionId latestRemovedXid = InvalidTransactionId;
+	Page		page = BufferGetPage(buf);
+	int			arraynitems;
+	int			finalnitems;
+
+	/*
+	 * Initial size of the array can fit everything when it turns out that
+	 * there are no posting lists
+	 */
+	arraynitems = nitems;
+	htids = (ItemPointer) palloc(sizeof(ItemPointerData) * arraynitems);
+
+	finalnitems = 0;
+	/* identify what the index tuples about to be deleted point to */
+	for (int i = 0; i < nitems; i++)
+	{
+		ItemId		itemid;
+		IndexTuple	itup;
+
+		itemid = PageGetItemId(page, itemnos[i]);
+		itup = (IndexTuple) PageGetItem(page, itemid);
+
+		Assert(ItemIdIsDead(itemid));
+
+		if (!BTreeTupleIsPosting(itup))
+		{
+			/* Make sure that we have space for additional heap TID */
+			if (finalnitems + 1 > arraynitems)
+			{
+				arraynitems = arraynitems * 2;
+				htids = (ItemPointer)
+					repalloc(htids, sizeof(ItemPointerData) * arraynitems);
+			}
+
+			Assert(ItemPointerIsValid(&itup->t_tid));
+			ItemPointerCopy(&itup->t_tid, &htids[finalnitems]);
+			finalnitems++;
+		}
+		else
+		{
+			int			nposting = BTreeTupleGetNPosting(itup);
+
+			/* Make sure that we have space for additional heap TIDs */
+			if (finalnitems + nposting > arraynitems)
+			{
+				arraynitems = Max(arraynitems * 2, finalnitems + nposting);
+				htids = (ItemPointer)
+					repalloc(htids, sizeof(ItemPointerData) * arraynitems);
+			}
+
+			for (int j = 0; j < nposting; j++)
+			{
+				ItemPointer htid = BTreeTupleGetPostingN(itup, j);
+
+				Assert(ItemPointerIsValid(htid));
+				ItemPointerCopy(htid, &htids[finalnitems]);
+				finalnitems++;
+			}
+		}
+	}
+
+	Assert(finalnitems >= nitems);
+
+	/* determine the actual xid horizon */
+	latestRemovedXid =
+		table_compute_xid_horizon_for_tuples(heapRel, htids, finalnitems);
+
+	pfree(htids);
+
+	return latestRemovedXid;
+}
+
 /*
  * Delete item(s) from a btree page during single-page cleanup.
  *
@@ -1046,7 +1258,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable,
  *
  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
  * the page, but it needs to generate its own recovery conflicts by accessing
- * the heap. See comments for _bt_delitems_vacuum.
+ * the heap, and doesn't handle updating posting list tuples. See comments
+ * for _bt_delitems_vacuum.
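+ *
+ * (Not handling posting list updates is fine here: LP_DEAD can only be set
+ * on a posting list tuple once every one of its heap TIDs is dead, so this
+ * path only ever removes whole physical tuples.)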
*/ void _bt_delitems_delete(Relation rel, Buffer buf, @@ -1062,8 +1275,8 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = - index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - itemnos, nitems); + _bt_compute_xid_horizon_for_tuples(rel, heapRel, buf, + itemnos, nitems); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -2061,6 +2274,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) xlmeta.fastlevel = metad->btm_fastlevel; xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + xlmeta.btm_safededup = metad->btm_safededup; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index bbc1376b0a..8a67193152 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -95,6 +95,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static ItemPointer btreevacuumposting(BTVacState *vstate, IndexTuple itup, + int *nremaining); /* @@ -158,7 +160,7 @@ btbuildempty(Relation index) /* Construct metapage. */ metapage = (Page) palloc(BLCKSZ); - _bt_initmetapage(metapage, P_NONE, 0); + _bt_initmetapage(metapage, P_NONE, 0, _bt_opclasses_support_dedup(index)); /* * Write the page and log it. It might seem that an immediate sync would @@ -261,8 +263,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) */ if (so->killedItems == NULL) so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) + palloc(MaxBTreeIndexTuplesPerPage * sizeof(int)); + if (so->numKilled < MaxBTreeIndexTuplesPerPage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -1151,8 +1153,17 @@ restart: } else if (P_ISLEAF(opaque)) { + /* Deletable item state */ OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; + int nhtidsdead; + int nhtidslive; + + /* Updatable item state (for posting lists) */ + IndexTuple updated[MaxOffsetNumber]; + OffsetNumber updatable[MaxOffsetNumber]; + int nupdatable; + OffsetNumber offnum, minoff, maxoff; @@ -1185,6 +1196,10 @@ restart: * callback function. */ ndeletable = 0; + nupdatable = 0; + /* Maintain stats counters for index tuple versions/heap TIDs */ + nhtidsdead = 0; + nhtidslive = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1194,11 +1209,9 @@ restart: offnum = OffsetNumberNext(offnum)) { IndexTuple itup; - ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); /* * During Hot Standby we currently assume that it's okay that @@ -1221,8 +1234,71 @@ restart: * applies to *any* type of index that marks index tuples as * killed. 
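+                 *
+                 * (For a posting list tuple there are three possible
+                 * outcomes below: all of its heap TIDs survive, and the
+                 * tuple is left alone; some survive, and the tuple is
+                 * replaced by a smaller posting list built by
+                 * btreevacuumposting(); or none survive, and the whole
+                 * item is deleted outright.)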
*/ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + if (!BTreeTupleIsPosting(itup)) + { + /* Regular tuple, standard heap TID representation */ + ItemPointer htid = &(itup->t_tid); + + if (callback(htid, callback_state)) + { + deletable[ndeletable++] = offnum; + nhtidsdead++; + } + else + nhtidslive++; + } + else + { + ItemPointer newhtids; + int nremaining; + + /* + * Posting list tuple, a physical tuple that represents + * two or more logical tuples, any of which could be an + * index row version that must be removed + */ + newhtids = btreevacuumposting(vstate, itup, &nremaining); + if (newhtids == NULL) + { + /* + * All TIDs/logical tuples from the posting tuple + * remain, so no update or delete required + */ + Assert(nremaining == BTreeTupleGetNPosting(itup)); + } + else if (nremaining > 0) + { + IndexTuple updatedtuple; + + /* + * Form new tuple that contains only remaining TIDs. + * Remember this tuple and the offset of the old tuple + * for when we update it in place + */ + Assert(nremaining < BTreeTupleGetNPosting(itup)); + updatedtuple = _bt_form_posting(itup, newhtids, + nremaining); + updated[nupdatable] = updatedtuple; + updatable[nupdatable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining; + pfree(newhtids); + } + else + { + /* + * All TIDs/logical tuples from the posting list must + * be deleted. We'll delete the physical tuple + * completely. + */ + deletable[ndeletable++] = offnum; + nhtidsdead += BTreeTupleGetNPosting(itup); + + /* Free empty array of live items */ + pfree(newhtids); + } + + nhtidslive += nremaining; + } } } @@ -1230,11 +1306,12 @@ restart: * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nupdatable > 0) { - _bt_delitems_vacuum(rel, buf, deletable, ndeletable); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, + updated, nupdatable); - stats->tuples_removed += ndeletable; + stats->tuples_removed += nhtidsdead; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } @@ -1249,6 +1326,7 @@ restart: * We treat this like a hint-bit update because there's no need to * WAL-log it. */ + Assert(nhtidsdead == 0); if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { @@ -1258,15 +1336,16 @@ restart: } /* - * If it's now empty, try to delete; else count the live tuples. We - * don't delete when recursing, though, to avoid putting entries into + * If it's now empty, try to delete; else count the live tuples (live + * heap TIDs in posting lists are counted as live tuples). We don't + * delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). */ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else - stats->num_index_tuples += maxoff - minoff + 1; + stats->num_index_tuples += nhtidslive; } if (delete_now) @@ -1309,6 +1388,68 @@ restart: } } +/* + * btreevacuumposting() -- determines which logical tuples must remain when + * VACUUMing a posting list tuple. + * + * Returns new palloc'd array of item pointers needed to build replacement + * posting list without the index row versions that are to be deleted. + * + * Note that returned array is NULL in the common case where there is nothing + * to delete in caller's posting list tuple. The number of TIDs that should + * remain in the posting list tuple is set for caller in *nremaining. 
This is + * also the size of the returned array (though only when array isn't just + * NULL). + */ +static ItemPointer +btreevacuumposting(BTVacState *vstate, IndexTuple itup, int *nremaining) +{ + int live = 0; + int nitem = BTreeTupleGetNPosting(itup); + ItemPointer tmpitems = NULL, + items = BTreeTupleGetPosting(itup); + + Assert(BTreeTupleIsPosting(itup)); + + /* + * Check each tuple in the posting list. Save live tuples into tmpitems, + * though try to avoid memory allocation as an optimization. + */ + for (int i = 0; i < nitem; i++) + { + if (!vstate->callback(items + i, vstate->callback_state)) + { + /* + * Live heap TID. + * + * Only save live TID when we know that we're going to have to + * kill at least one TID, and have already allocated memory. + */ + if (tmpitems) + tmpitems[live] = items[i]; + live++; + } + + /* Dead heap TID */ + else if (tmpitems == NULL) + { + /* + * Turns out we need to delete one or more dead heap TIDs, so + * start maintaining an array of live TIDs for caller to + * reconstruct smaller replacement posting list tuple + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + + /* Copy live heap TIDs from previous loop iterations */ + if (live > 0) + memcpy(tmpitems, items, sizeof(ItemPointerData) * live); + } + } + + *nremaining = live; + return tmpitems; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 8e512461a0..c954926f2d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,10 +26,18 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer heapTid, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, + ItemPointer heapTid); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -434,7 +442,10 @@ _bt_binsrch(Relation rel, * low) makes bounds invalid. * * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time. + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by their + * scantid/new heap TID). */ OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate) @@ -453,6 +464,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) Assert(P_ISLEAF(opaque)); Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); if (!insertstate->bounds_valid) { @@ -509,6 +521,16 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) if (result != 0) stricthigh = high; } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. 
Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); } /* @@ -528,6 +550,68 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Returns offset into posting list where caller's scantid belongs. + *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + /* + * In the unlikely event that posting list tuple has LP_DEAD bit set, + * signal to caller that it should kill the item and restart its binary + * search. + */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res > 0) + low = mid + 1; + else if (res < 0) + high = mid; + else + return mid; + } + + /* Exact match not found */ + return low; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -537,9 +621,14 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). 
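+ *
+ * For example, an insertion scankey with scantid (10,3) compares as
+ * "equal" to a posting tuple with equal key values whose heap TIDs run
+ * from (8,1) to (12,4), even though no TID in the posting list may be
+ * exactly (10,3).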
* * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the @@ -563,6 +652,7 @@ _bt_compare(Relation rel, ScanKey scankey; int ncmpkey; int ntupatts; + int32 result; Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); @@ -597,7 +687,6 @@ _bt_compare(Relation rel, { Datum datum; bool isNull; - int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); @@ -713,8 +802,25 @@ _bt_compare(Relation rel, if (heapTid == NULL) return 1; + /* + * scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * as a simple scalar value. + */ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; } /* @@ -1230,6 +1336,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* Initialize remaining insertion scan key fields */ inskey.heapkeyspace = _bt_heapkeyspace(rel); + inskey.safededup = false; /* unused */ inskey.anynullkeys = false; /* unused */ inskey.nextkey = nextkey; inskey.pivotsearch = false; @@ -1451,6 +1558,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* initialize tuple workspace to empty */ so->currPos.nextTupleOffset = 0; + so->currPos.postingTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be @@ -1484,9 +1592,31 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { - /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + /* + * Set up state to return posting list, and remember first + * "logical" tuple + */ + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional logical tuples */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1519,7 +1649,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxBTreeIndexTuplesPerPage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1527,7 +1657,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxBTreeIndexTuplesPerPage; offnum = Min(offnum, maxoff); @@ -1568,9 +1698,37 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) &continuescan); if (passes_quals && tuple_alive) { - /* tuple passes all scan key 
conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int i = BTreeTupleGetNPosting(itup) - 1; + + /* + * Set up state to return posting list, and remember last + * "logical" tuple (since we'll return it first) + */ + itemIndex--; + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i--), + itup); + + /* + * Remember additional logical tuples (use desc order to + * be consistent with order of entire scan) + */ + for (; i >= 0; i--) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i)); + } + } } if (!continuescan) { @@ -1584,8 +1742,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxBTreeIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxBTreeIndexTuplesPerPage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1598,6 +1756,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1610,6 +1770,64 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* + * Setup state to save posting items from a single posting list tuple. Saves + * the logical tuple that will be returned to scan first in passing. + * + * Saves an index item into so->currPos.items[itemIndex] for logical tuple + * that is returned to scan first. Second or subsequent heap TID for posting + * list should be saved by calling _bt_savepostingitem(). + */ +static void +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + if (so->currTuples) + { + /* Save base IndexTuple (truncate posting list) */ + IndexTuple base; + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset); + memcpy(base, itup, itupsz); + /* Defensively reduce work area index tuple header size */ + base->t_info &= ~INDEX_SIZE_MASK; + base->t_info |= itupsz; + so->currPos.nextTupleOffset += itupsz; + so->currPos.postingTupleOffset = currItem->tupleOffset; + } +} + +/* + * Save an index item into so->currPos.items[itemIndex] for posting tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. 
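+ *
+ * Note that only the heap TID and offset number are saved here: every
+ * logical tuple from the same posting list shares the single copy of the
+ * base tuple that _bt_setuppostingitems() stashed in so->currTuples.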
+ */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer heapTid) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *heapTid; + currItem->indexOffset = offnum; + + /* + * Have index-only scans return the same base IndexTuple for every logical + * tuple that originates from the same posting list + */ + if (so->currTuples) + currItem->tupleOffset = so->currPos.postingTupleOffset; +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 1dd39a9535..b40559d45f 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -243,6 +243,7 @@ typedef struct BTPageState BlockNumber btps_blkno; /* block # to write this page at */ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ OffsetNumber btps_lastoff; /* last item offset loaded */ + Size btps_lastextra; /* last item's extra posting list space */ uint32 btps_level; /* tree level (0 = leaf) */ Size btps_full; /* "full" if less than this much free space */ struct BTPageState *btps_next; /* link to parent level, if any */ @@ -277,7 +278,10 @@ static void _bt_slideleft(Page page); static void _bt_sortaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, - IndexTuple itup); + IndexTuple itup, Size truncextra); +static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, + BTPageState *state, + BTDedupState dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); @@ -711,6 +715,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) state->btps_lowkey = NULL; /* initialize lastoff so first item goes into P_FIRSTKEY */ state->btps_lastoff = P_HIKEY; + state->btps_lastextra = 0; state->btps_level = level; /* set "full" threshold based on level. See notes at head of file. */ if (level > 0) @@ -789,7 +794,8 @@ _bt_sortaddtup(Page page, } /*---------- - * Add an item to a disk page from the sort output. + * Add an item to a disk page from the sort output (or add a posting list + * item formed from the sort output). * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. @@ -821,14 +827,27 @@ _bt_sortaddtup(Page page, * the truncated high key at offset 1. * * 'last' pointer indicates the last offset added to the page. + * + * 'truncextra' is the size of the posting list in itup, if any. This + * information is stashed for the next call here, when we may benefit + * from considering the impact of truncating away the posting list on + * the page before deciding to finish the page off. Posting lists are + * often relatively large, so it is worth going to the trouble of + * accounting for the saving from truncating away the posting list of + * the tuple that becomes the high key (that may be the only way to + * get close to target free space on the page). Note that this is + * only used for the soft fillfactor-wise limit, not the critical hard + * limit. 
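+ *
+ * (For example, a tuple whose posting list holds 100 heap TIDs carries
+ * 600 bytes of TID array. If it is the last tuple added, the high key
+ * built from it is truncated down to its key values, so the soft limit
+ * is applied as though those 600 bytes were already free space.)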
*---------- */ static void -_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) +_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, + Size truncextra) { Page npage; BlockNumber nblkno; OffsetNumber last_off; + Size last_truncextra; Size pgspc; Size itupsz; bool isleaf; @@ -842,6 +861,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; + last_truncextra = state->btps_lastextra; + state->btps_lastextra = truncextra; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleSize(itup); @@ -883,10 +904,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * page. Disregard fillfactor and insert on "full" current page if we * don't have the minimum number of items yet. (Note that we deliberately * assume that suffix truncation neither enlarges nor shrinks new high key - * when applying soft limit.) + * when applying soft limit, except when last tuple had a posting list.) */ if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || - (pgspc < state->btps_full && last_off > P_FIRSTKEY)) + (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. @@ -944,11 +965,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * We don't try to bias our choice of split point to make it more * likely that _bt_truncate() can truncate away more attributes, * whereas the split point used within _bt_split() is chosen much - * more delicately. Suffix truncation is mostly useful because it - * improves space utilization for workloads with random - * insertions. It doesn't seem worthwhile to add logic for - * choosing a split point here for a benefit that is bound to be - * much smaller. + * more delicately. On the other hand, non-unique index builds + * usually deduplicate, which often results in every "physical" + * tuple on the page having distinct key values. When that + * happens, _bt_truncate() will never need to include a heap TID + * in the new high key. * * Overwrite the old item with new truncated high key directly. * oitup is already located at the physical beginning of tuple @@ -983,7 +1004,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); BTreeInnerTupleSetDownLink(state->btps_lowkey, oblkno); - _bt_buildadd(wstate, state->btps_next, state->btps_lowkey); + _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); pfree(state->btps_lowkey); /* @@ -1045,6 +1066,47 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) state->btps_lastoff = last_off; } +/* + * Finalize pending posting list tuple, and add it to the index. Final tuple + * is based on saved base tuple, and saved list of heap TIDs. + * + * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple + * using _bt_buildadd() and does not maintain the intervals array. 
+ */
+static void
+_bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
+							  BTDedupState dstate)
+{
+	IndexTuple	final;
+	Size		truncextra;
+
+	Assert(dstate->nitems > 0);
+	truncextra = 0;
+	if (dstate->nitems == 1)
+		final = dstate->base;
+	else
+	{
+		IndexTuple	postingtuple;
+
+		/* form a tuple with a posting list */
+		postingtuple = _bt_form_posting(dstate->base,
+										dstate->htids,
+										dstate->nhtids);
+		final = postingtuple;
+		/* Determine size of posting list */
+		truncextra = IndexTupleSize(final) -
+			BTreeTupleGetPostingOffset(final);
+	}
+
+	_bt_buildadd(wstate, state, final, truncextra);
+
+	if (dstate->nitems > 1)
+		pfree(final);
+	/* Don't maintain dedup_intervals array, or alltupsize */
+	dstate->nhtids = 0;
+	dstate->nitems = 0;
+}
+
 /*
  * Finish writing out the completed btree.
  */
@@ -1090,7 +1152,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 			Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
 				   !P_LEFTMOST(opaque));
 			BTreeInnerTupleSetDownLink(s->btps_lowkey, blkno);
-			_bt_buildadd(wstate, s->btps_next, s->btps_lowkey);
+			_bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
 			pfree(s->btps_lowkey);
 			s->btps_lowkey = NULL;
 		}
@@ -1111,7 +1173,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 	 * by filling in a valid magic number in the metapage.
 	 */
 	metapage = (Page) palloc(BLCKSZ);
-	_bt_initmetapage(metapage, rootblkno, rootlevel);
+	_bt_initmetapage(metapage, rootblkno, rootlevel,
+					 wstate->inskey->safededup);
 	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
 }

@@ -1132,6 +1195,9 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
 	SortSupport sortKeys;
 	int64		tuples_done = 0;
+	bool		deduplicate;
+
+	deduplicate = wstate->inskey->safededup && BTGetUseDedup(wstate->index);

 	if (merge)
 	{
@@ -1228,12 +1294,12 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)

 			if (load1)
 			{
-				_bt_buildadd(wstate, state, itup);
+				_bt_buildadd(wstate, state, itup, 0);
 				itup = tuplesort_getindextuple(btspool->sortstate, true);
 			}
 			else
 			{
-				_bt_buildadd(wstate, state, itup2);
+				_bt_buildadd(wstate, state, itup2, 0);
 				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
 			}

@@ -1243,9 +1309,113 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 		}
 		pfree(sortKeys);
 	}
+	else if (deduplicate)
+	{
+		/* merge is unnecessary, deduplicate into posting lists */
+		BTDedupState dstate;
+		IndexTuple	newbase;
+
+		dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
+		dstate->maxitemsize = 0;	/* set later */
+		dstate->checkingunique = false; /* unused */
+		dstate->skippedbase = InvalidOffsetNumber;
+		dstate->newitem = NULL;
+		/* Metadata about current pending posting list */
+		dstate->htids = NULL;
+		dstate->nhtids = 0;
+		dstate->nitems = 0;
+		dstate->overlap = false;
+		dstate->alltupsize = 0; /* unused */
+		/* Metadata about base tuple of current pending posting list */
+		dstate->base = NULL;
+		dstate->baseoff = InvalidOffsetNumber;	/* unused */
+		dstate->basetupsize = 0;
+
+		while ((itup = tuplesort_getindextuple(btspool->sortstate,
+											   true)) != NULL)
+		{
+			/* When we see first tuple, create first index page */
+			if (state == NULL)
+			{
+				state = _bt_pagestate(wstate, 0);
+
+				/*
+				 * Limit size of posting list tuples to the size of the free
+				 * space we want to leave behind on the page, plus space for
+				 * final item's line pointer (but make sure that posting list
+				 * tuple size won't exceed the generic 1/3 of a page limit).
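+				 *
+				 * (Illustrative arithmetic, assuming BLCKSZ of 8192, the
+				 * default leaf fillfactor of 90, and a BTMaxItemSize() of
+				 * 2704: btps_full starts out as 819 bytes, so maxitemsize
+				 * works out to Min(2704, MAXALIGN_DOWN(819) - 4) = 812
+				 * bytes, or roughly 130 heap TIDs per posting list tuple.)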
+ * + * This is more conservative than the approach taken in the + * retail insert path, but it allows us to get most of the + * space savings deduplication provides without noticeably + * impacting how much free space is left behind on each leaf + * page. + */ + dstate->maxitemsize = + Min(BTMaxItemSize(state->btps_page), + MAXALIGN_DOWN(state->btps_full) - sizeof(ItemIdData)); + /* Minimum posting tuple size used here is arbitrary: */ + dstate->maxitemsize = Max(dstate->maxitemsize, 100); + dstate->htids = palloc(dstate->maxitemsize); + + /* + * No previous/base tuple, since itup is the first item + * returned by the tuplesort -- use itup as base tuple of + * first pending posting list for entire index build + */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list, and + * merging itup into pending posting list won't exceed the + * maxitemsize limit. Heap TID(s) for itup have been saved in + * state. The next iteration will also end up here if it's + * possible to merge the next tuple into the same pending + * posting list. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * maxitemsize limit was reached + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + + /* itup starts new pending posting list */ + newbase = CopyIndexTuple(itup); + _bt_dedup_start_pending(dstate, newbase, InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + /* + * Handle the last item (there must be a last item when the tuplesort + * returned one or more tuples) + */ + if (state) + { + _bt_sort_dedup_finish_pending(wstate, state, dstate); + /* Base tuple is always a copy */ + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } else { - /* merge is unnecessary */ + /* merging and deduplication are both unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { @@ -1253,7 +1423,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) if (state == NULL) state = _bt_pagestate(wstate, 0); - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup, 0); /* Report progress */ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 29167f1ef5..ffec42e78a 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -51,6 +51,7 @@ typedef struct Size newitemsz; /* size of newitem (includes line pointer) */ bool is_leaf; /* T if splitting a leaf page */ bool is_rightmost; /* T if splitting rightmost page on level */ + bool is_deduped; /* T if posting list truncation expected */ OffsetNumber newitemoff; /* where the new item is to be inserted */ int leftspace; /* space available for items on left page */ int rightspace; /* space available for items on right page */ @@ -177,12 +178,16 @@ _bt_findsplitloc(Relation rel, state.newitemsz = newitemsz; state.is_leaf = P_ISLEAF(opaque); state.is_rightmost = P_RIGHTMOST(opaque); + state.is_deduped = state.is_leaf && BTGetUseDedup(rel); state.leftspace = leftspace; state.rightspace = rightspace; state.olddataitemstotal = olddataitemstotal; 
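	/*
	 * A sketch, paraphrasing the accounting that _bt_recsplitloc() performs
	 * below when is_deduped is set and the would-be first tuple on the
	 * right page turns out to be a posting list tuple: the posting list
	 * bytes are credited back against the space conservatively reserved
	 * for the new high key on the left page.
	 *
	 *		postingsz = IndexTupleSize(newhighkey) -
	 *			BTreeTupleGetPostingOffset(newhighkey);
	 *		leftfree -= (int16) ((firstrightitemsz - postingsz) +
	 *							 MAXALIGN(sizeof(ItemPointerData)));
	 */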
state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + /* * maxsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once @@ -459,6 +464,7 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsz = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? */ @@ -468,8 +474,31 @@ _bt_recsplitloc(FindSplitData *state, if (newitemisfirstonright) firstrightitemsz = state->newitemsz; else + { firstrightitemsz = firstoldonrightsz; + /* + * Calculate suffix truncation space saving when firstright is a + * posting list tuple. + * + * Individual posting lists often take up a significant fraction of + * all space on a page. Failing to consider that the new high key + * won't need to store the posting list a second time really matters. + */ + if (state->is_leaf && state->is_deduped) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsz = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -492,9 +521,11 @@ _bt_recsplitloc(FindSplitData *state, * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and * will rarely be larger, but conservatively assume the worst case. + * Truncation always truncates away any posting list that appears in the + * first right tuple, though, so it's safe to subtract that overhead. */ if (state->is_leaf) - leftfree -= (int16) (firstrightitemsz + + leftfree -= (int16) ((firstrightitemsz - postingsz) + MAXALIGN(sizeof(ItemPointerData))); else leftfree -= (int16) firstrightitemsz; @@ -691,7 +722,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->page, itemid); /* Do cheaper test first */ - if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index ee972a1465..cb6a5b9335 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -20,6 +20,7 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/relscan.h" +#include "catalog/catalog.h" #include "commands/progress.h" #include "lib/qunique.h" #include "miscadmin.h" @@ -98,8 +99,6 @@ _bt_mkscankey(Relation rel, IndexTuple itup) indoption = rel->rd_indoption; tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); - /* * We'll execute search using scan key constructed on key columns. 
* Truncated attributes and non-key attributes are omitted from the final @@ -108,12 +107,25 @@ _bt_mkscankey(Relation rel, IndexTuple itup) key = palloc(offsetof(BTScanInsertData, scankeys) + sizeof(ScanKeyData) * indnkeyatts); key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel); + key->safededup = itup == NULL ? _bt_opclasses_support_dedup(rel) : + _bt_safededup(rel); key->anynullkeys = false; /* initial assumption */ key->nextkey = false; key->pivotsearch = false; + key->scantid = NULL; key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? - BTreeTupleGetHeapTID(itup) : NULL; + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + Assert(!itup || !BTreeTupleIsPosting(itup) || key->heapkeyspace); + + /* + * When caller passes a tuple with a heap TID, use it to set scantid. Note + * that this handles posting list tuples by setting scantid to the lowest + * heap TID in the posting list. + */ + if (itup && key->heapkeyspace) + key->scantid = BTreeTupleGetHeapTID(itup); + skey = key->scankeys; for (i = 0; i < indnkeyatts; i++) { @@ -1373,6 +1385,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1534,6 +1547,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) break; @@ -1773,10 +1787,35 @@ _bt_killitems(IndexScanDesc scan) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + if (BTreeTupleIsPosting(ituple)) { - /* found the item */ + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* Read-ahead to later kitems */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + if (killtuple) + { + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ @@ -2017,7 +2056,9 @@ btoptions(Datum reloptions, bool validate) static const relopt_parse_elt tab[] = { {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)}, {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, - offsetof(BTOptions, vacuum_cleanup_index_scale_factor)} + offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, + {"deduplication", RELOPT_TYPE_BOOL, + offsetof(BTOptions, deduplication)} }; @@ -2138,6 +2179,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(firstright)) + { + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetNAtts(pivot, keepnatts); + if (keepnatts == natts) + { + /* + * index_truncate_tuple() just returned a copy of the + * original, so make sure that the size of the new pivot tuple + * doesn't have posting list overhead + */ + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + } + + 
Assert(!BTreeTupleIsPosting(pivot)); + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2154,6 +2213,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. */ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft) && + !BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2161,6 +2222,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pfree(pivot); pivot = tidpivot; } + else if (BTreeTupleIsPosting(firstright)) + { + /* + * No truncation was possible, since key attributes are all equal. We + * can always truncate away a posting list, though. + * + * It's necessary to add a heap TID attribute to the new pivot tuple. + */ + newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright)); + + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetAltHeapTID(pivot); + } else { /* @@ -2186,6 +2265,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * nbtree (e.g., there is no pg_attribute entry). */ Assert(itup_key->heapkeyspace); + Assert(!BTreeTupleIsPosting(pivot)); pivot->t_info &= ~INDEX_SIZE_MASK; pivot->t_info |= newsize; @@ -2198,7 +2278,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2209,9 +2289,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2224,7 +2307,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. 
Note that @@ -2233,7 +2316,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2314,13 +2398,16 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal after detoasting. + * unless they're bitwise equal after detoasting. When an index is considered + * deduplication-safe by _bt_opclasses_support_dedup, routine is guaranteed to + * give the same result as _bt_keep_natts would. * - * These issues must be acceptable to callers, typically because they're only - * concerned about making suffix truncation as effective as possible without - * leaving excessive amounts of free space on either side of page split. - * Callers can rely on the fact that attributes considered equal here are - * definitely also equal according to _bt_keep_natts. + * Suffix truncation callers can rely on the fact that attributes considered + * equal here are definitely also equal according to _bt_keep_natts, even when + * the index uses an opclass or collation that is not deduplication-safe. + * This weaker guarantee is good enough for these callers, since false + * negatives generally only have the effect of making leaf page splits use a + * more balanced split point. */ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2398,22 +2485,30 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); tupnatts = BTreeTupleGetNAtts(itup, rel); + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + if (P_ISLEAF(opaque)) { if (offnum >= P_FIRSTDATAKEY(opaque)) { /* - * Non-pivot tuples currently never use alternative heap TID - * representation -- even those within heapkeyspace indexes + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated. (Note that tupnatts must have been - * inferred, rather than coming from an explicit on-disk - * representation.) + * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) */ return tupnatts == natts; } @@ -2457,12 +2552,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * non-zero, or when there is no explicit representation and the * tuple is evidently not a pre-pg_upgrade tuple. * - * Prior to v11, downlinks always had P_HIKEY as their offset. Use - * that to decide if the tuple is a pre-v11 tuple. + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. 
 		 */
 		return tupnatts == 0 ||
-			((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
-			 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
+			ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
 	}
 	else
 	{
@@ -2488,7 +2583,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 	 * heapkeyspace index pivot tuples, regardless of whether or not there are
 	 * non-key attributes.
 	 */
-	if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+	if (!BTreeTupleIsPivot(itup))
+		return false;
+
+	/* Pivot tuple should not use posting list representation (redundant) */
+	if (BTreeTupleIsPosting(itup))
 		return false;

 	/*
@@ -2558,11 +2657,54 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
 					BTMaxItemSizeNoHeapTid(page),
 					RelationGetRelationName(rel)),
 			 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
-					   ItemPointerGetBlockNumber(&newtup->t_tid),
-					   ItemPointerGetOffsetNumber(&newtup->t_tid),
+					   ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
+					   ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
 					   RelationGetRelationName(heap)),
 			 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
 					 "Consider a function index of an MD5 hash of the value, "
 					 "or use full text indexing."),
 			 errtableconstraint(heap,
								RelationGetRelationName(rel))));
}

+/*
+ * Is it safe to perform deduplication for an index, given the opclasses and
+ * collations used?
+ *
+ * Returned value is stored in index metapage during index builds.  Function
+ * does not account for incompatibilities caused by the index being on an
+ * earlier nbtree version.
+ */
+bool
+_bt_opclasses_support_dedup(Relation index)
+{
+	/* INCLUDE indexes don't support deduplication */
+	if (IndexRelationGetNumberOfAttributes(index) !=
+		IndexRelationGetNumberOfKeyAttributes(index))
+		return false;
+
+	/*
+	 * There is no reason why deduplication cannot be used with system catalog
+	 * indexes.  However, we deem it generally unsafe because it's not clear
+	 * how it could be disabled.  (ALTER INDEX is not supported with system
+	 * catalog indexes, so users have no way to set the "deduplication"
+	 * storage parameter.)
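+	 *
+	 * (A sketch of how the result computed here is expected to be combined
+	 * with the user-visible settings: nbtsort.c decides whether a CREATE
+	 * INDEX will deduplicate using
+	 *
+	 *		deduplicate = wstate->inskey->safededup &&
+	 *			BTGetUseDedup(wstate->index);
+	 *
+	 * where safededup carries this function's result, and BTGetUseDedup()
+	 * tests the "deduplication" reloption/GUC.)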
+ */ + if (IsCatalogRelation(index)) + return false; + + for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++) + { + Oid opfamily = index->rd_opfamily[i]; + Oid collation = index->rd_indcollation[i]; + + /* TODO add adequate check of opclasses and collations */ + elog(DEBUG4, "index %s column i %d opfamilyOid %u collationOid %u", + RelationGetRelationName(index), i, opfamily, collation); + + /* NUMERIC btree opfamily OID is 1988 */ + if (opfamily == 1988) + return false; + } + + return true; +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 72a601bb22..191ab63a9b 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -22,6 +22,9 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "storage/procarray.h" +#include "utils/memutils.h" + +static MemoryContext opCtx; /* working memory for operations */ /* * _bt_restore_page -- re-enter all the index tuples on a page @@ -111,6 +114,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) Assert(md->btm_version >= BTREE_NOVAC_VERSION); md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; + md->btm_safededup = xlrec->btm_safededup; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -156,7 +160,8 @@ _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) } static void -btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) +btree_xlog_insert(bool isleaf, bool ismeta, bool posting, + XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); @@ -181,9 +186,52 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) - elog(PANIC, "btree_xlog_insert: failed to add item"); + if (likely(!posting)) + { + /* Simple retail insertion */ + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add item"); + } + else + { + ItemId itemid; + IndexTuple oposting, + newitem, + nposting; + uint16 postingoff; + + /* + * A posting list split occurred during leaf page insertion. WAL + * record data will start with an offset number representing the + * point in an existing posting list that a split occurs at. + * + * Use _bt_swap_posting() to repeat posting list split steps from + * primary. Note that newitem from WAL record is 'orignewitem', + * not the final version of newitem that is actually inserted on + * page. 
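+		 *
+		 * (To illustrate with hypothetical TIDs and postingoff = 2: if the
+		 * existing posting list holds TIDs (A, B, D) and orignewitem's TID
+		 * is C, the replacement posting list becomes (A, B, C), while
+		 * newitem is left carrying TID D -- the posting list's former
+		 * maximum TID.  The item added at xlrec->offnum is then the same
+		 * "final" newitem that was inserted on the primary.)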
+		 */
+		postingoff = *((uint16 *) datapos);
+		datapos += sizeof(uint16);
+		datalen -= sizeof(uint16);
+
+		itemid = PageGetItemId(page, OffsetNumberPrev(xlrec->offnum));
+		oposting = (IndexTuple) PageGetItem(page, itemid);
+
+		/* newitem must be mutable copy for _bt_swap_posting() */
+		Assert(isleaf && postingoff > 0);
+		newitem = CopyIndexTuple((IndexTuple) datapos);
+		nposting = _bt_swap_posting(newitem, oposting, postingoff);
+
+		/* Replace existing posting list with post-split version */
+		memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
+
+		/* insert "final" new item (not orignewitem from WAL stream) */
+		Assert(IndexTupleSize(newitem) == datalen);
+		if (PageAddItem(page, (Item) newitem, datalen, xlrec->offnum,
+						false, false) == InvalidOffsetNumber)
+			elog(PANIC, "btree_xlog_insert: failed to add posting split new item");
+	}

 		PageSetLSN(page, lsn);
 		MarkBufferDirty(buffer);
@@ -265,20 +313,38 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
 	BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
 	OffsetNumber off;
 	IndexTuple	newitem = NULL,
-				left_hikey = NULL;
+				left_hikey = NULL,
+				nposting = NULL;
 	Size		newitemsz = 0,
 				left_hikeysz = 0;
 	Page		newlpage;
-	OffsetNumber leftoff;
+	OffsetNumber leftoff,
+				replacepostingoff = InvalidOffsetNumber;

 	datapos = XLogRecGetBlockData(record, 0, &datalen);

-	if (onleft)
+	if (onleft || xlrec->postingoff != 0)
 	{
 		newitem = (IndexTuple) datapos;
 		newitemsz = MAXALIGN(IndexTupleSize(newitem));
 		datapos += newitemsz;
 		datalen -= newitemsz;
+
+		if (xlrec->postingoff != 0)
+		{
+			ItemId		itemid;
+			IndexTuple	oposting;
+
+			/* Posting list must be at offset number before new item's */
+			replacepostingoff = OffsetNumberPrev(xlrec->newitemoff);
+
+			/* newitem must be mutable copy for _bt_swap_posting() */
+			newitem = CopyIndexTuple(newitem);
+			itemid = PageGetItemId(lpage, replacepostingoff);
+			oposting = (IndexTuple) PageGetItem(lpage, itemid);
+			nposting = _bt_swap_posting(newitem, oposting,
+										xlrec->postingoff);
+		}
 	}

 	/* Extract left hikey and its size (assuming 16-bit alignment) */
@@ -304,8 +370,20 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
 		Size		itemsz;
 		IndexTuple	item;

+		/* Add replacement posting list when required */
+		if (off == replacepostingoff)
+		{
+			Assert(onleft || xlrec->firstright == xlrec->newitemoff);
+			if (PageAddItem(newlpage, (Item) nposting,
+							MAXALIGN(IndexTupleSize(nposting)), leftoff,
+							false, false) == InvalidOffsetNumber)
+				elog(ERROR, "failed to add new posting list item to left page after split");
+			leftoff = OffsetNumberNext(leftoff);
+			continue;
+		}
+
 		/* add the new item if it was inserted on left page */
-		if (onleft && off == xlrec->newitemoff)
+		else if (onleft && off == xlrec->newitemoff)
 		{
 			if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
 							false, false) == InvalidOffsetNumber)
@@ -379,6 +457,84 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
 	}
 }

+static void
+btree_xlog_dedup(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	Buffer		buf;
+	xl_btree_dedup *xlrec = (xl_btree_dedup *) XLogRecGetData(record);
+
+	if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO)
+	{
+		/*
+		 * Replay the deduplication of a single interval of the page: merge
+		 * the nitems existing tuples that start at baseoff into one posting
+		 * list tuple, reusing the same pending-posting-list state machine
+		 * as the primary.
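+		 *
+		 * (For example, with hypothetical values baseoff = 5 and nitems =
+		 * 3, the loop below folds the heap TIDs of the tuples at offsets
+		 * 5, 6 and 7 into one pending posting list, and
+		 * _bt_dedup_finish_pending() then replaces those items on the page
+		 * with a single posting list tuple.)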
+		 */
+		Page		page = (Page) BufferGetPage(buf);
+		OffsetNumber offnum;
+		BTDedupState state;
+
+		state = (BTDedupState) palloc(sizeof(BTDedupStateData));
+
+		state->maxitemsize = BTMaxItemSize(page);
+		state->checkingunique = false;	/* unused */
+		state->skippedbase = InvalidOffsetNumber;
+		state->newitem = NULL;
+		/* Metadata about current pending posting list */
+		state->htids = NULL;
+		state->nhtids = 0;
+		state->nitems = 0;
+		state->alltupsize = 0;
+		state->overlap = false;
+		/* Metadata about base tuple of current pending posting list */
+		state->base = NULL;
+		state->baseoff = InvalidOffsetNumber;
+		state->basetupsize = 0;
+
+		/* Conservatively size array */
+		state->htids = palloc(state->maxitemsize);
+
+		/*
+		 * Iterate over tuples on the page belonging to the interval to
+		 * deduplicate them into a posting list.
+		 */
+		for (offnum = xlrec->baseoff;
+			 offnum < xlrec->baseoff + xlrec->nitems;
+			 offnum = OffsetNumberNext(offnum))
+		{
+			ItemId		itemid = PageGetItemId(page, offnum);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+			Assert(!ItemIdIsDead(itemid));
+
+			if (offnum == xlrec->baseoff)
+			{
+				/*
+				 * No previous/base tuple for first data item -- use first
+				 * data item as base tuple of first pending posting list
+				 */
+				_bt_dedup_start_pending(state, itup, offnum);
+			}
+			else
+			{
+				/* Heap TID(s) for itup will be saved in state */
+				if (!_bt_dedup_save_htid(state, itup))
+					elog(ERROR, "could not add heap tid to pending posting list");
+			}
+		}
+
+		Assert(state->nitems == xlrec->nitems);
+		/* Handle the last item */
+		_bt_dedup_finish_pending(buf, state, false);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buf);
+	}
+
+	if (BufferIsValid(buf))
+		UnlockReleaseBuffer(buf);
+}
+
 static void
 btree_xlog_vacuum(XLogReaderState *record)
 {
@@ -395,7 +551,38 @@ btree_xlog_vacuum(XLogReaderState *record)

 		page = (Page) BufferGetPage(buffer);

-		PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
+		/*
+		 * Must update posting list tuples before deleting whole items, since
+		 * offset numbers are based on original page contents
+		 */
+		if (xlrec->nupdated > 0)
+		{
+			OffsetNumber *updatedoffsets;
+			IndexTuple	updated;
+			Size		itemsz;
+
+			updatedoffsets = (OffsetNumber *)
+				(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+			updated = (IndexTuple) ((char *) updatedoffsets +
+									xlrec->nupdated * sizeof(OffsetNumber));
+
+			/* Handle posting tuples */
+			for (int i = 0; i < xlrec->nupdated; i++)
+			{
+				PageIndexTupleDelete(page, updatedoffsets[i]);
+
+				itemsz = MAXALIGN(IndexTupleSize(updated));
+
+				if (PageAddItem(page, (Item) updated, itemsz, updatedoffsets[i],
+								false, false) == InvalidOffsetNumber)
+					elog(PANIC, "failed to add updated posting list item");
+
+				updated = (IndexTuple) ((char *) updated + itemsz);
+			}
+		}
+
+		if (xlrec->ndeleted)
+			PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);

 		/*
 		 * Mark the page as not containing any LP_DEAD items --- see comments
@@ -729,17 +916,22 @@ void
 btree_redo(XLogReaderState *record)
 {
 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+	MemoryContext oldCtx;

+	oldCtx = MemoryContextSwitchTo(opCtx);
 	switch (info)
 	{
 		case XLOG_BTREE_INSERT_LEAF:
-			btree_xlog_insert(true, false, record);
+			btree_xlog_insert(true, false, false, record);
 			break;
 		case XLOG_BTREE_INSERT_UPPER:
-			btree_xlog_insert(false, false, record);
+			btree_xlog_insert(false, false, false, record);
 			break;
 		case XLOG_BTREE_INSERT_META:
-			btree_xlog_insert(false, true, record);
+			btree_xlog_insert(false, true, false, record);
+			break;
+		case
XLOG_BTREE_INSERT_POST: + btree_xlog_insert(true, false, true, record); break; case XLOG_BTREE_SPLIT_L: btree_xlog_split(true, record); @@ -747,6 +939,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_SPLIT_R: btree_xlog_split(false, record); break; + case XLOG_BTREE_DEDUP_PAGE: + btree_xlog_dedup(record); + break; case XLOG_BTREE_VACUUM: btree_xlog_vacuum(record); break; @@ -772,6 +967,23 @@ btree_redo(XLogReaderState *record) default: elog(PANIC, "btree_redo: unknown op code %u", info); } + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(opCtx); +} + +void +btree_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Btree recovery temporary context", + ALLOCSET_DEFAULT_SIZES); +} + +void +btree_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 497f8dc77e..23e951aa9e 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -27,6 +27,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) case XLOG_BTREE_INSERT_LEAF: case XLOG_BTREE_INSERT_UPPER: case XLOG_BTREE_INSERT_META: + case XLOG_BTREE_INSERT_POST: { xl_btree_insert *xlrec = (xl_btree_insert *) rec; @@ -38,15 +39,27 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; - appendStringInfo(buf, "level %u, firstright %d, newitemoff %d", - xlrec->level, xlrec->firstright, xlrec->newitemoff); + appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", + xlrec->level, + xlrec->firstright, + xlrec->newitemoff, + xlrec->postingoff); + break; + } + case XLOG_BTREE_DEDUP_PAGE: + { + xl_btree_dedup *xlrec = (xl_btree_dedup *) rec; + + appendStringInfo(buf, "baseoff %u; nitems %u", + xlrec->baseoff, xlrec->nitems); break; } case XLOG_BTREE_VACUUM: { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "ndeleted %u", xlrec->ndeleted); + appendStringInfo(buf, "ndeleted %u; nupdated %u", + xlrec->ndeleted, xlrec->nupdated); break; } case XLOG_BTREE_DELETE: @@ -130,6 +143,12 @@ btree_identify(uint8 info) case XLOG_BTREE_SPLIT_R: id = "SPLIT_R"; break; + case XLOG_BTREE_DEDUP_PAGE: + id = "DEDUPLICATE"; + break; + case XLOG_BTREE_INSERT_POST: + id = "INSERT_POST"; + break; case XLOG_BTREE_VACUUM: id = "VACUUM"; break; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ba4edde71a..6b5d36de57 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -28,6 +28,7 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/nbtree.h" #include "access/rmgr.h" #include "access/tableam.h" #include "access/transam.h" @@ -363,6 +364,23 @@ static const struct config_enum_entry backslash_quote_options[] = { {NULL, 0, false} }; +/* + * Although only "on", "off", and "nonunique" are documented, we accept all + * the likely variants of "on" and "off". + */ +static const struct config_enum_entry btree_deduplication_options[] = { + {"off", DEDUP_OFF, false}, + {"on", DEDUP_ON, false}, + {"nonunique", DEDUP_NONUNIQUE, false}, + {"false", DEDUP_OFF, true}, + {"true", DEDUP_ON, true}, + {"no", DEDUP_OFF, true}, + {"yes", DEDUP_ON, true}, + {"0", DEDUP_OFF, true}, + {"1", DEDUP_ON, true}, + {NULL, 0, false} +}; + /* * Although only "on", "off", and "partition" are documented, we * accept all the likely variants of "on" and "off". 
@@ -4271,6 +4289,16 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"btree_deduplication", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Enables B-tree index deduplication optimization."), + NULL + }, + &btree_deduplication, + DEDUP_NONUNIQUE, btree_deduplication_options, + NULL, NULL, NULL + }, + { {"bytea_output", PGC_USERSET, CLIENT_CONN_STATEMENT, gettext_noop("Sets the output format for bytea."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 46a06ffacd..0b8aa56b3a 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -650,6 +650,7 @@ #vacuum_cleanup_index_scale_factor = 0.1 # fraction of total number of tuples # before index cleanup, 0 always performs # index cleanup +#btree_deduplication = 'nonunique' # off, on, or nonunique #bytea_output = 'hex' # hex, escape #xmlbinary = 'base64' #xmloption = 'content' diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index df26826993..7e55c0ff90 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -1677,14 +1677,14 @@ psql_completion(const char *text, int start, int end) /* ALTER INDEX SET|RESET ( */ else if (Matches("ALTER", "INDEX", MatchAny, "RESET", "(")) COMPLETE_WITH("fillfactor", - "vacuum_cleanup_index_scale_factor", /* BTREE */ + "vacuum_cleanup_index_scale_factor", "deduplication", /* BTREE */ "fastupdate", "gin_pending_list_limit", /* GIN */ "buffering", /* GiST */ "pages_per_range", "autosummarize" /* BRIN */ ); else if (Matches("ALTER", "INDEX", MatchAny, "SET", "(")) COMPLETE_WITH("fillfactor =", - "vacuum_cleanup_index_scale_factor =", /* BTREE */ + "vacuum_cleanup_index_scale_factor =", "deduplication =", /* BTREE */ "fastupdate =", "gin_pending_list_limit =", /* GIN */ "buffering =", /* GiST */ "pages_per_range =", "autosummarize =" /* BRIN */ diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 3542545de5..8b1223a817 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,7 @@ static void bt_tuple_present_callback(Relation index, ItemPointer tid, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static inline IndexTuple bt_posting_logical_tuple(IndexTuple itup, int n); static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); @@ -419,12 +420,13 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Size Bloom filter based on estimated number of tuples in index, * while conservatively assuming that each block must contain at least - * MaxIndexTuplesPerPage / 5 non-pivot tuples. (Non-leaf pages cannot - * contain non-pivot tuples. That's okay because they generally make - * up no more than about 1% of all pages in the index.) + * MaxBTreeIndexTuplesPerPage / 3 "logical" tuples. heapallindexed + * verification fingerprints posting list heap TIDs as plain non-pivot + * tuples, complete with index keys. This allows its heap scan to + * behave as if posting lists do not exist. 
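+	 *
+	 * (Illustrative arithmetic for 8kB pages, assuming the usual 24 byte
+	 * page header and 16 byte B-Tree special area:
+	 * MaxBTreeIndexTuplesPerPage is (8192 - 24 - 16) / 6 = 1358 logical
+	 * tuples, so each block is assumed to contribute at least 1358 / 3 =
+	 * 452 elements below.)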
*/ total_pages = RelationGetNumberOfBlocks(rel); - total_elems = Max(total_pages * (MaxIndexTuplesPerPage / 5), + total_elems = Max(total_pages * (MaxBTreeIndexTuplesPerPage / 3), (int64) state->rel->rd_rel->reltuples); /* Random seed relies on backend srandom() call to avoid repetition */ seed = random(); @@ -924,6 +926,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -994,29 +997,72 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * "logical" posting list tuple, since the posting list itself is + * validated by other checks. */ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { + ItemPointer tid = BTreeTupleGetHeapTID(itup); char *itid, *htid; itid = psprintf("(%u,%u)", state->targetblock, offset); - htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", itid, htid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + /* + * If tuple is actually a posting list, make sure posting list TIDs + * are in order. 
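+		 *
+		 * (Strictly ascending order also rules out duplicate TIDs within a
+		 * posting list: the check below rejects equal neighbors, not just
+		 * out-of-order ones.)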
+ */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(current), + ItemPointerGetOffsetNumberNoCheck(current)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("posting list heap TIDs out of order in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1074,12 +1120,32 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + /* Fingerprint all elements as distinct "logical" tuples */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + IndexTuple logtuple; + + logtuple = bt_posting_logical_tuple(itup, i); + norm = bt_normalize_tuple(state, logtuple); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != logtuple) + pfree(norm); + pfree(logtuple); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1087,7 +1153,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. * * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1127,6 +1194,9 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (state->heapkeyspace && !BTreeTupleIsPivot(itup)) + skey->scantid = BTreeTupleGetMaxHeapTID(itup); if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? 
invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) @@ -1150,6 +1220,7 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + skey->scantid = scantid; /* * * Item order check * @@ -1160,15 +1231,17 @@ bt_target_page_check(BtreeCheckState *state) if (OffsetNumberNext(offset) <= max && !invariant_l_offset(state, skey, OffsetNumberNext(offset))) { + ItemPointer tid; char *itid, *htid, *nitid, *nhtid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetHeapTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1177,9 +1250,11 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + + tid = BTreeTupleGetHeapTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1189,10 +1264,10 @@ bt_target_page_check(BtreeCheckState *state) "higher index tid=%s (points to %s tid=%s) " "page lsn=%X/%X.", itid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", htid, nitid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", nhtid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); @@ -1953,10 +2028,10 @@ bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). - * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. Deduplication is performed opportunistically, + * and in general there is no guarantee about how or when it will be applied. */ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -1969,6 +2044,9 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) IndexTuple reformed; int i; + /* Caller should only pass "logical" non-pivot tuples here */ + Assert(!BTreeTupleIsPosting(itup) && !BTreeTupleIsPivot(itup)); + /* Easy case: It's immediately clear that tuple has no varlena datums */ if (!IndexTupleHasVarwidths(itup)) return itup; @@ -2031,6 +2109,30 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Produce palloc()'d "logical" tuple for nth posting list entry. + * + * In general, deduplication is not supposed to change the logical contents of + * an index. Multiple logical index tuples are folded together into one + * physical posting list index tuple when convenient. 
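+ *
+ * (For example, a posting list tuple with key value 'foo' and heap TIDs
+ * t1, t2 and t3 is logically equivalent to the three plain tuples
+ * ('foo', t1), ('foo', t2) and ('foo', t3); this helper reconstructs the
+ * nth such plain tuple.)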
+ * + * heapallindexed verification must normalize-away this variation in + * representation by converting posting list tuples into two or more "logical" + * tuples. Each logical tuple must be fingerprinted separately -- there must + * be one logical tuple for each corresponding Bloom filter probe during the + * heap scan. + * + * Note: Caller needs to call bt_normalize_tuple() with returned tuple. + */ +static inline IndexTuple +bt_posting_logical_tuple(IndexTuple itup, int n) +{ + Assert(BTreeTupleIsPosting(itup)); + + /* Returns non-posting-list tuple */ + return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1); +} + /* * Search for itup in index, starting from fast root page. itup must be a * non-pivot tuple. This is only supported with heapkeyspace indexes, since @@ -2087,6 +2189,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = key; + insertstate.postingoff = 0; insertstate.bounds_valid = false; insertstate.buf = lbuf; @@ -2094,7 +2197,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) offnum = _bt_binsrch_insert(state->rel, &insertstate); /* Compare first >= matching item on leaf page, if any */ page = BufferGetPage(lbuf); + /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && + insertstate.postingoff <= 0 && _bt_compare(state->rel, key, page, offnum) == 0) exists = true; _bt_relbuf(state->rel, lbuf); @@ -2548,26 +2653,25 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, } /* - * BTreeTupleGetHeapTID() wrapper that lets caller enforce that a heap TID must - * be present in cases where that is mandatory. - * - * This doesn't add much as of BTREE_VERSION 4, since the INDEX_ALT_TID_MASK - * bit is effectively a proxy for whether or not the tuple is a pivot tuple. - * It may become more useful in the future, when non-pivot tuples support their - * own alternative INDEX_ALT_TID_MASK representation. + * BTreeTupleGetHeapTID() wrapper that enforces that a heap TID is present in + * cases where that is mandatory (i.e. for non-pivot tuples). */ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot) { - ItemPointer result = BTreeTupleGetHeapTID(itup); - BlockNumber targetblock = state->targetblock; + Assert(state->heapkeyspace); - if (result == NULL && nonpivot) + /* + * Make sure that tuple type (pivot vs non-pivot) matches caller's + * expectation + */ + if (BTreeTupleIsPivot(itup) == nonpivot) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", - targetblock, RelationGetRelationName(state->rel)))); + state->targetblock, + RelationGetRelationName(state->rel)))); - return result; + return BTreeTupleGetHeapTID(itup); } diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml index 5881ea5dd6..13d9b2ff96 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -433,11 +433,130 @@ returns bool Implementation + + Internally, a B-tree index consists of a tree structure with leaf + pages. Each leaf page contains tuples that point to table entries + using a heap item pointer. Each tuple's key is considered unique + internally, since the item pointer is treated as part of the key. + + + An introduction to the btree index implementation can be found in + src/backend/access/nbtree/README. 
+
+
+
+  Deduplication
-   An introduction to the btree index implementation can be found in
-   src/backend/access/nbtree/README.
+   B-Tree supports deduplication.  Existing
+   leaf page tuples with fully equal keys (equal prior to the heap
+   item pointer) are merged together into a single posting
+   list tuple.  The keys appear only once in this
+   representation.  A simple array of heap item pointers follows.
+   Posting lists are formed lazily, when a new item is
+   inserted that cannot fit on an existing leaf page.  The immediate
+   goal of the deduplication process is to at least free enough space
+   to fit the new item; otherwise a leaf page split occurs, which
+   allocates a new leaf page.  The key space
+   covered by the original leaf page is shared between the original
+   page and its new right sibling page.
+
+   Deduplication can greatly increase index space efficiency with data
+   sets where each distinct key appears at least a few times on
+   average.  It can also reduce the cost of subsequent index scans,
+   especially when many leaf pages must be accessed.  For example, an
+   index on a simple integer column that uses
+   deduplication will have a storage size that is only about 65% of an
+   equivalent unoptimized index when each distinct
+   integer value appears three times.  If each distinct
+   integer value appears six times, the storage overhead
+   can be as low as 50% of baseline.  With hundreds of duplicates per
+   distinct value (or with larger base key values) a
+   storage size of about one third of the
+   unoptimized case is expected.  There is often a direct benefit for
+   queries, as well as an indirect benefit due to reduced I/O during
+   routine vacuuming.
+
+   Cases that don't benefit due to having no duplicate values will
+   incur a small performance penalty with mixed read-write workloads.
+   There is no performance penalty with read-only workloads, since
+   reading from posting lists is at least as efficient as reading the
+   standard index tuple representation.
+
+
+
+  Configuring Deduplication
+
+   The btree_deduplication configuration
+   parameter controls deduplication.  By default, deduplication is
+   only used with non-unique indexes.  The
+   deduplication storage parameter can be used to
+   override the configuration parameter for individual indexes.  See
+   the description of the deduplication storage
+   parameter in the CREATE INDEX documentation for details.
+
+
+
+  Unique Indexes and Deduplication
+
+   Unique indexes can also use deduplication, despite the fact that
+   unique indexes do not logically contain
+   duplicates; implementation-level physical
+   duplicates may still be present.  Unique indexes that are prone to
+   becoming bloated due to a short-term burst in updates are good
+   candidates.  VACUUM will eventually remove dead
+   versions of tuples from unique indexes, but it may not be possible
+   for it to do so before some number of unnecessary
+   page splits have taken place.  Deduplication can prevent these page
+   splits from happening.  Note that page splits can only be reversed
+   by VACUUM when the page is
+   completely empty, which isn't expected in this
+   scenario.
+
+   In other cases, deduplication can be effective with unique indexes
+   just because of the presence of many NULL values
+   in the unique index.  The influence of must also be
+   considered.
+
+   For more information about automatic and manual vacuuming, see
+   the documentation on routine vacuuming.  Note that the heap-only tuple
+   (HOT) optimization can also prevent page splits
+   caused only by versioned tuples rather than by insertions of new
+   values.
+
+
+
+  Restrictions
+
+
+   Deduplication can only be used with B-Tree operator classes that
+   were declared BITWISE.  In practice almost all
+   data types support deduplication, though numeric
+   is a notable exception (its display scale feature makes it
+   impossible to enable deduplication without losing useful information
+   about equal numeric datums).  Deduplication is
+   not supported with nondeterministic collations, nor is it supported
+   with INCLUDE indexes.
+
+
+   Note that a multicolumn index is only considered to have duplicates
+   when there are index entries that repeat entire
+   combinations of values (the values stored in
+   each and every column must be equal).
+
+
+
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index 55669b5cad..9f371d3e3a 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -928,10 +928,11 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr
     nondeterministic collations give a more correct behavior,
     especially when considering the full power of Unicode and its many
     special cases, they also have some drawbacks.  Foremost, their use leads
-    to a performance penalty.  Also, certain operations are not possible with
-    nondeterministic collations, such as pattern matching operations.
-    Therefore, they should be used only in cases where they are specifically
-    wanted.
+    to a performance penalty.  Note, in particular, that B-tree cannot use
+    deduplication with indexes that use a nondeterministic collation.  Also,
+    certain operations are not possible with nondeterministic collations,
+    such as pattern matching operations.  Therefore, they should be used
+    only in cases where they are specifically wanted.
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d4d1fe45cc..6f89e4a51f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8000,6 +8000,39 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
+
+     btree_deduplication (enum)
+
+      btree_deduplication
+      configuration parameter
+
+
+      Controls the use of deduplication within B-Tree indexes.
+      Deduplication is an optimization that reduces the storage size
+      of indexes by storing equal index keys only once.  See the
+      Deduplication section of the B-Tree documentation for more
+      information.
+
+
+      Besides off, which disables deduplication
+      entirely, there are two modes: on and
+      nonunique.  When
+      btree_deduplication is set to
+      nonunique, the default, deduplication is
+      only used for non-unique B-Tree indexes.
+
+
+      This setting can be overridden for individual B-Tree indexes
+      by changing index storage parameters.  See the storage
+      parameters section of the CREATE INDEX
+      documentation for details.
+
+
 bytea_output (enum)
diff --git a/doc/src/sgml/maintenance.sgml b/doc/src/sgml/maintenance.sgml
index ec8bdcd7a4..695aa9123d 100644
--- a/doc/src/sgml/maintenance.sgml
+++ b/doc/src/sgml/maintenance.sgml
@@ -887,6 +887,14 @@ analyze threshold = analyze base threshold + analyze scale factor * number of tu
    might be worthwhile to reindex periodically just to improve access speed.

+
+
+   Enabling B-tree deduplication in unique indexes can be an effective
+   way to control index bloat in extreme cases.  See the B-Tree
+   documentation for details.
+
+
+
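To make the three modes of the new GUC concrete, here is how they might be toggled in a session. The parameter name and its accepted values come directly from this patch; that the setting is user-settable per session is an assumption (its presence in postgresql.conf.sample suggests so), and it governs future insertions only, leaving existing posting lists untouched:

    SHOW btree_deduplication;               -- 'nonunique' is the shipped default
    SET btree_deduplication = 'off';        -- disable deduplication entirely
    SET btree_deduplication = 'on';         -- also allow it for unique indexes
    SET btree_deduplication = 'nonunique';  -- restore the default behavior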
    REINDEX can be used safely and easily in all cases.
    This command requires an ACCESS EXCLUSIVE lock by
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 629a31ef79..abc7db4820 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -166,6 +166,8 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
@@ -388,10 +390,39 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]

-    B-tree indexes additionally accept this parameter:
+    B-tree indexes also accept these parameters:

+
+   deduplication
+
+    deduplication
+    storage parameter
+
+
+    Per-index value for the btree_deduplication
+    configuration parameter.  Controls usage of the B-Tree
+    deduplication technique described in the Deduplication section of
+    the B-Tree documentation.  Set to ON or
+    OFF to override the GUC.  (Alternative
+    spellings of ON and OFF
+    are allowed, as for any Boolean storage parameter.)
+
+
+    Turning deduplication off via ALTER
+    INDEX prevents future insertions from triggering
+    deduplication, but does not in itself make existing posting list
+    tuples use the standard tuple representation.
+
+
 vacuum_cleanup_index_scale_factor
@@ -446,9 +477,7 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
     This setting controls usage of the fast update technique described
     in the GIN documentation.  It is a Boolean parameter:
     ON enables fast update, OFF disables it.
-    (Alternative spellings of ON and OFF are
-    allowed as described in the configuration documentation.)  The
-    default is ON.
+    The default is ON.
@@ -831,6 +860,13 @@ CREATE UNIQUE INDEX title_idx ON films (title) WITH (fillfactor = 70);

+
+   To create a unique index with deduplication enabled:
+
+CREATE UNIQUE INDEX title_idx ON films (title) WITH (deduplication = on);
+
+
+
    To create a GIN index with fast updates disabled:
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index 10881ab03a..c9a5349019 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -58,8 +58,9 @@ REINDEX [ ( VERBOSE ) ] { INDEX | TABLE | SCHEMA | DATABASE | SYSTEM } [ CONCURR

-      You have altered a storage parameter (such as fillfactor)
-      for an index, and wish to ensure that the change has taken full effect.
+      You have altered a storage parameter (such as fillfactor or
+      deduplication) for an index, and wish to ensure that the change has
+      taken full effect.

diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
index f567117a46..53bcd1f30a 100644
--- a/src/test/regress/expected/btree_index.out
+++ b/src/test/regress/expected/btree_index.out
@@ -266,6 +266,22 @@ select * from btree_bpchar where f1::bpchar like 'foo%';
  fool
 (2 rows)

+--
+-- Test deduplication within a unique index
+--
+CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
+CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplication=on);
+CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplication=off);
+-- Generate enough garbage tuples in the index to ensure that even the unique
+-- index with deduplication enabled has to check multiple leaf pages during
+-- unique checking (at least with a BLCKSZ of 8192 or less)
+DO $$
+BEGIN
+	FOR r IN 1..1350 LOOP
+		DELETE FROM dedup_unique_test_table;
+		INSERT INTO dedup_unique_test_table SELECT 1;
+	END LOOP;
+END$$;
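Tying the CREATE INDEX and REINDEX hunks above together: altering the storage parameter only governs future insertions, so a rebuild is needed for the change to take full effect. A sketch reusing the documentation's films example (title_idx as above):

    CREATE UNIQUE INDEX title_idx ON films (title) WITH (deduplication = on);

    -- Stop deduplicating new insertions; existing posting lists are untouched
    ALTER INDEX title_idx SET (deduplication = off);

    -- Rebuild so that every tuple uses the standard representation again
    REINDEX INDEX title_idx;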
 --
 -- Test B-tree fast path (cache rightmost leaf page) optimization.
 --
diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
index 558dcae0ec..f008a5a55f 100644
--- a/src/test/regress/sql/btree_index.sql
+++ b/src/test/regress/sql/btree_index.sql
@@ -103,6 +103,23 @@ explain (costs off) select * from btree_bpchar where f1::bpchar like 'foo%';
 select * from btree_bpchar where f1::bpchar like 'foo%';

+--
+-- Test deduplication within a unique index
+--
+CREATE TABLE dedup_unique_test_table (a int) WITH (autovacuum_enabled=false);
+CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplication=on);
+CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplication=off);
+-- Generate enough garbage tuples in the index to ensure that even the unique
+-- index with deduplication enabled has to check multiple leaf pages during
+-- unique checking (at least with a BLCKSZ of 8192 or less)
+DO $$
+BEGIN
+	FOR r IN 1..1350 LOOP
+		DELETE FROM dedup_unique_test_table;
+		INSERT INTO dedup_unique_test_table SELECT 1;
+	END LOOP;
+END$$;
+
 --
 -- Test B-tree fast path (cache rightmost leaf page) optimization.
 --
-- 
2.17.1
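A natural follow-up to the regression test above is to compare the two unique indexes after the churn loop, to see how much bloat deduplication prevented. A sketch; the index names come from the test itself, and exact sizes vary with BLCKSZ and vacuum timing:

    SELECT c.relname, pg_size_pretty(pg_relation_size(c.oid)) AS size
    FROM pg_class c
    WHERE c.relname IN ('dedup_unique', 'plain_unique')
    ORDER BY c.relname;

With deduplication enabled, dedup_unique should stay markedly smaller than plain_unique, since the repeated dead versions of the single key are absorbed into posting lists instead of forcing page splits.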