From c07e06ff1ee2a0c595cdf773546c69940db73dd6 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 29 Aug 2019 14:35:35 -0700 Subject: [PATCH v11 1/2] Add deduplication to nbtree. --- contrib/amcheck/verify_nbtree.c | 128 +++++-- src/backend/access/nbtree/README | 76 +++- src/backend/access/nbtree/nbtinsert.c | 462 +++++++++++++++++++++++- src/backend/access/nbtree/nbtpage.c | 148 +++++++- src/backend/access/nbtree/nbtree.c | 147 ++++++-- src/backend/access/nbtree/nbtsearch.c | 247 ++++++++++++- src/backend/access/nbtree/nbtsort.c | 219 ++++++++++- src/backend/access/nbtree/nbtsplitloc.c | 47 ++- src/backend/access/nbtree/nbtutils.c | 264 ++++++++++++-- src/backend/access/nbtree/nbtxlog.c | 88 ++++- src/backend/access/rmgrdesc/nbtdesc.c | 10 +- src/include/access/nbtree.h | 242 +++++++++++-- src/include/access/nbtxlog.h | 36 +- src/tools/valgrind.supp | 21 ++ 14 files changed, 1957 insertions(+), 178 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 05e7d678ed..399743d4d6 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -924,6 +924,7 @@ bt_target_page_check(BtreeCheckState *state) size_t tupsize; BTScanInsert skey; bool lowersizelimit; + ItemPointer scantid; CHECK_FOR_INTERRUPTS(); @@ -994,29 +995,73 @@ bt_target_page_check(BtreeCheckState *state) /* * Readonly callers may optionally verify that non-pivot tuples can - * each be found by an independent search that starts from the root + * each be found by an independent search that starts from the root. + * Note that we deliberately don't do individual searches for each + * "logical" posting list tuple, since the posting list itself is + * validated by other checks. */ if (state->rootdescend && P_ISLEAF(topaque) && !bt_rootdescend(state, itup)) { char *itid, *htid; + ItemPointer tid = BTreeTupleGetHeapTID(itup); itid = psprintf("(%u,%u)", state->targetblock, offset); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumber(&(itup->t_tid)), - ItemPointerGetOffsetNumber(&(itup->t_tid))); + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("could not find tuple using search from root page in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", itid, htid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + /* + * If tuple is actually a posting list, make sure posting list TIDs + * are in order. 
+ */ + if (BTreeTupleIsPosting(itup)) + { + ItemPointerData last; + ItemPointer current; + + ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last); + + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + + current = BTreeTupleGetPostingN(itup, i); + + if (ItemPointerCompare(current, &last) <= 0) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumberNoCheck(current), + ItemPointerGetOffsetNumberNoCheck(current)); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("posting list heap TIDs out of order in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + + ItemPointerCopy(current, &last); + } + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1074,12 +1119,33 @@ bt_target_page_check(BtreeCheckState *state) { IndexTuple norm; - norm = bt_normalize_tuple(state, itup); - bloom_add_element(state->filter, (unsigned char *) norm, - IndexTupleSize(norm)); - /* Be tidy */ - if (norm != itup) - pfree(norm); + if (BTreeTupleIsPosting(itup)) + { + IndexTuple onetup; + + /* Fingerprint all elements of posting tuple one by one */ + for (int i = 0; i < BTreeTupleGetNPosting(itup); i++) + { + onetup = BTreeGetNthTupleOfPosting(itup, i); + + norm = bt_normalize_tuple(state, onetup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != onetup) + pfree(norm); + pfree(onetup); + } + } + else + { + norm = bt_normalize_tuple(state, itup); + bloom_add_element(state->filter, (unsigned char *) norm, + IndexTupleSize(norm)); + /* Be tidy */ + if (norm != itup) + pfree(norm); + } } /* @@ -1087,7 +1153,8 @@ bt_target_page_check(BtreeCheckState *state) * * If there is a high key (if this is not the rightmost page on its * entire level), check that high key actually is upper bound on all - * page items. + * page items. If this is a posting list tuple, we'll need to set + * scantid to be highest TID in posting list. * * We prefer to check all items against high key rather than checking * just the last and trusting that the operator class obeys the @@ -1127,6 +1194,9 @@ bt_target_page_check(BtreeCheckState *state) * tuple. (See also: "Notes About Data Representation" in the nbtree * README.) */ + scantid = skey->scantid; + if (state->heapkeyspace && !BTreeTupleIsPivot(itup)) + skey->scantid = BTreeTupleGetMaxTID(itup); if (!P_RIGHTMOST(topaque) && !(P_ISLEAF(topaque) ? 
invariant_leq_offset(state, skey, P_HIKEY) : invariant_l_offset(state, skey, P_HIKEY))) @@ -1150,6 +1220,7 @@ bt_target_page_check(BtreeCheckState *state) (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); } + skey->scantid = scantid; /* * * Item order check * @@ -1164,11 +1235,13 @@ bt_target_page_check(BtreeCheckState *state) *htid, *nitid, *nhtid; + ItemPointer tid; itid = psprintf("(%u,%u)", state->targetblock, offset); + tid = BTreeTupleGetHeapTID(itup); htid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); nitid = psprintf("(%u,%u)", state->targetblock, OffsetNumberNext(offset)); @@ -1177,9 +1250,11 @@ bt_target_page_check(BtreeCheckState *state) state->target, OffsetNumberNext(offset)); itup = (IndexTuple) PageGetItem(state->target, itemid); + + tid = BTreeTupleGetHeapTID(itup); nhtid = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)), - ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid))); + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid)); ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -1189,10 +1264,10 @@ bt_target_page_check(BtreeCheckState *state) "higher index tid=%s (points to %s tid=%s) " "page lsn=%X/%X.", itid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", htid, nitid, - P_ISLEAF(topaque) ? "heap" : "index", + P_ISLEAF(topaque) ? "min heap" : "index", nhtid, (uint32) (state->targetlsn >> 32), (uint32) state->targetlsn))); @@ -1953,10 +2028,10 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values, * verification. In particular, it won't try to normalize opclass-equal * datums with potentially distinct representations (e.g., btree/numeric_ops * index datums will not get their display scale normalized-away here). - * Normalization may need to be expanded to handle more cases in the future, - * though. For example, it's possible that non-pivot tuples could in the - * future have alternative logically equivalent representations due to using - * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication. + * Caller does normalization for non-pivot tuples that have a posting list, + * since dummy CREATE INDEX callback code generates new tuples with the same + * normalized representation. Deduplication is performed opportunistically, + * and in general there is no guarantee about how or when it will be applied. 
*/ static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) @@ -2087,6 +2162,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = key; + insertstate.in_posting_offset = 0; insertstate.bounds_valid = false; insertstate.buf = lbuf; @@ -2094,7 +2170,9 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) offnum = _bt_binsrch_insert(state->rel, &insertstate); /* Compare first >= matching item on leaf page, if any */ page = BufferGetPage(lbuf); + /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && + insertstate.in_posting_offset <= 0 && _bt_compare(state->rel, key, page, offnum) == 0) exists = true; _bt_relbuf(state->rel, lbuf); @@ -2560,14 +2638,18 @@ static inline ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot) { - ItemPointer result = BTreeTupleGetHeapTID(itup); + ItemPointer result; BlockNumber targetblock = state->targetblock; - if (result == NULL && nonpivot) + /* Shouldn't be called with heapkeyspace index */ + Assert(state->heapkeyspace); + if (BTreeTupleIsPivot(itup) == nonpivot) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID", targetblock, RelationGetRelationName(state->rel)))); + result = BTreeTupleGetHeapTID(itup); + return result; } diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 6db203e75c..50ec9ef48c 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -432,7 +432,10 @@ because we allow LP_DEAD to be set with only a share lock (it's exactly like a hint bit for a heap tuple), but physically removing tuples requires exclusive lock. In the current code we try to remove LP_DEAD tuples when we are otherwise faced with having to split a page to do an insertion (and -hence have exclusive lock on it already). +hence have exclusive lock on it already). Deduplication can also prevent +a page split, but removing LP_DEAD tuples is the preferred approach. +(Note that posting list tuples can only have their LP_DEAD bit set when +every "logical" tuple represented within the posting list is known dead.) This leaves the index in a state where it has no entry for a dead tuple that still exists in the heap. This is not a problem for the current @@ -710,6 +713,77 @@ the fallback strategy assumes that duplicates are mostly inserted in ascending heap TID order. The page is split in a way that leaves the left half of the page mostly full, and the right half of the page mostly empty. +Notes about deduplication +------------------------- + +We deduplicate non-pivot tuples in non-unique indexes to reduce storage +overhead, and to avoid or at least delay page splits. Deduplication alters +the physical representation of tuples without changing the logical contents +of the index, and without adding overhead to read queries. Non-pivot +tuples are folded together into a single physical tuple with a posting list +(a simple array of heap TIDs with the standard item pointer format). +Deduplication is always applied lazily, at the point where it would +otherwise be necessary to perform a page split. It occurs only when +LP_DEAD items have been removed, as our last line of defense against +splitting a leaf page. 
We can set the LP_DEAD bit with posting list +tuples, though only when all table tuples are known dead. (Bitmap scans +cannot perform LP_DEAD bit setting, and are the common case with indexes +that contain lots of duplicates, so this downside is considered +acceptable.) + +Large groups of logical duplicates tend to appear together on the same leaf +page due to the special duplicate logic used when choosing a split point. +This facilitates lazy/dynamic deduplication. Deduplication can reliably +deduplicate a large localized group of duplicates before it can span +multiple leaf pages. Posting list tuples are subject to the same 1/3 of a +page restriction as any other tuple. + +Lazy deduplication allows the page space accounting used during page splits +to have absolutely minimal special case logic for posting lists. A posting +list can be thought of as extra payload that suffix truncation will +reliably truncate away as needed during page splits, just like non-key +columns from an INCLUDE index tuple. An incoming tuple (which might cause +a page split) can always be thought of as a non-posting-list tuple that +must be inserted alongside existing items, without needing to consider +deduplication. Most of the time, that's what actually happens: incoming +tuples are either not duplicates, or are duplicates with a heap TID that +doesn't overlap with any existing posting list tuple (lazy deduplication +avoids rewriting posting lists repeatedly when heap TIDs are inserted +slightly out of order by concurrent inserters). When the incoming tuple +really does overlap with an existing posting list, a posting list split is +performed. Posting list splits work in a way that more or less preserves +the illusion that all incoming tuples do not need to be merged with any +existing posting list tuple. + +Posting list splits work by "overriding" the details of the incoming tuple. +The heap TID of the incoming tuple is altered to make it match the +rightmost heap TID from the existing/originally overlapping posting list. +The offset number that the new/incoming tuple is to be inserted at is +incremented so that it will be inserted to the right of the existing +posting list. The insertion (or page split) operation that completes the +insert does one extra step: an in-place update of the posting list. The +update changes the posting list such that the "true" heap TID from the +original incoming tuple is now contained in the posting list. We make +space in the posting list by removing the heap TID that became the new +item. The size of the posting list won't change, and so the page split +space accounting does not need to care about posting lists. Also, overall +space utilization is improved by keeping existing posting lists large. + +The representation of posting lists is identical to the posting lists used +by GIN, so it would be straightforward to apply GIN's varbyte encoding +compression scheme to individual posting lists. Posting list compression +would break the assumptions made by posting list splits about page space +accounting, though, so it's not clear how compression could be integrated +with nbtree. Besides, posting list compression does not offer a compelling +trade-off for nbtree, since in general nbtree is optimized for consistent +performance with many concurrent readers and writers. A major goal of +nbtree's lazy approach to deduplication is to limit the performance impact +of deduplication with random updates. 
Even concurrent append-only inserts +of the same key value will tend to have inserts of individual index tuples +in an order that doesn't quite match heap TID order. In general, delaying +deduplication avoids many unnecessary posting list splits, and minimizes +page level fragmentation. + Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index b84bf1c3df..bef5958465 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -47,21 +47,26 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int in_posting_offset, bool split_only_page); static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, - IndexTuple newitem); + IndexTuple newitem, IndexTuple nposting); static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only); static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); +static void _bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, + Size itemsz); +static void _bt_dedup_insert(Page page, BTDedupState *dedupState); /* * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. * * This routine is called by the public interface routine, btinsert. - * By here, itup is filled in, including the TID. + * By here, itup is filled in, including the TID. Caller should be + * prepared for us to scribble on 'itup'. * * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or @@ -123,6 +128,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, /* PageAddItem will MAXALIGN(), but be consistent */ insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); insertstate.itup_key = itup_key; + insertstate.in_posting_offset = 0; insertstate.bounds_valid = false; insertstate.buf = InvalidBuffer; @@ -300,7 +306,7 @@ top: newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, stack, heapRel); _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack, - itup, newitemoff, false); + itup, newitemoff, insertstate.in_posting_offset, false); } else { @@ -435,6 +441,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); + Assert(!BTreeTupleIsPosting(curitup)); htid = curitup->t_tid; /* @@ -689,6 +696,7 @@ _bt_findinsertloc(Relation rel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(insertstate->buf); BTPageOpaque lpageop; + OffsetNumber location; lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -751,13 +759,23 @@ _bt_findinsertloc(Relation rel, /* * If the target page is full, see if we can obtain enough space by - * erasing LP_DEAD items + * erasing LP_DEAD items. If that doesn't work out, and if the index + * isn't a unique index, try deduplication. 
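+	 * Note that deduplication is strictly a fallback here: it is only
+	 * attempted when removing LP_DEAD items (if there were any) still
+	 * leaves too little free space for the incoming tuple.  Even then,
+	 * _bt_dedup_one_page() may decline to do anything (e.g., for INCLUDE
+	 * indexes); if that happens, or if deduplication does not free enough
+	 * space, the insertion simply falls back to an ordinary page split.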
*/ - if (PageGetFreeSpace(page) < insertstate->itemsz && - P_HAS_GARBAGE(lpageop)) + if (PageGetFreeSpace(page) < insertstate->itemsz) { - _bt_vacuum_one_page(rel, insertstate->buf, heapRel); - insertstate->bounds_valid = false; + if (P_HAS_GARBAGE(lpageop)) + { + _bt_vacuum_one_page(rel, insertstate->buf, heapRel); + insertstate->bounds_valid = false; + } + + if (!checkingunique && PageGetFreeSpace(page) < insertstate->itemsz) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, + insertstate->itemsz); + insertstate->bounds_valid = false; /* paranoia */ + } } } else @@ -839,7 +857,31 @@ _bt_findinsertloc(Relation rel, Assert(P_RIGHTMOST(lpageop) || _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); - return _bt_binsrch_insert(rel, insertstate); + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Insertion is not prepared for the case where an LP_DEAD posting list + * tuple must be split. In the unlikely event that this happens, call + * _bt_dedup_one_page() to force it to kill all LP_DEAD items. + */ + if (unlikely(insertstate->in_posting_offset == -1)) + { + _bt_dedup_one_page(rel, insertstate->buf, heapRel, 0); + Assert(!P_HAS_GARBAGE(lpageop)); + + /* Must reset insertstate ahead of new _bt_binsrch_insert() call */ + insertstate->bounds_valid = false; + insertstate->in_posting_offset = 0; + location = _bt_binsrch_insert(rel, insertstate); + + /* + * Might still have to split some other posting list now, but that + * should never be LP_DEAD + */ + Assert(insertstate->in_posting_offset >= 0); + } + + return location; } /* @@ -905,10 +947,12 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * This recursive procedure does the following things: * + * + if necessary, splits an existing posting list on page. + * This is only needed when 'in_posting_offset' is non-zero. * + if necessary, splits the target page, using 'itup_key' for * suffix truncation on leaf pages (caller passes NULL for * non-leaf pages). - * + inserts the tuple. + * + inserts the new tuple (could be from split posting list). * + if the page was split, pops the parent stack, and finds the * right place to insert the new child pointer (by walking * right using information stored in the parent stack). @@ -918,7 +962,8 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack) * * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, - * we will have dropped both the pin and the lock on the buffer. + * we will have dropped both the pin and the lock on the buffer. Caller + * should be prepared for us to scribble on 'itup'. * * This routine only performs retail tuple insertions. 'itup' should * always be either a non-highkey leaf item, or a downlink (new high @@ -936,11 +981,14 @@ _bt_insertonpg(Relation rel, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, + int in_posting_offset, bool split_only_page) { Page page; BTPageOpaque lpageop; Size itemsz; + IndexTuple nposting = NULL; + IndexTuple oposting; page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -954,6 +1002,8 @@ _bt_insertonpg(Relation rel, Assert(P_ISLEAF(lpageop) || BTreeTupleGetNAtts(itup, rel) <= IndexRelationGetNumberOfKeyAttributes(rel)); + /* retail insertions of posting list tuples are disallowed */ + Assert(!BTreeTupleIsPosting(itup)); /* The caller should've finished any incomplete splits already. 
*/ if (P_INCOMPLETE_SPLIT(lpageop)) @@ -964,6 +1014,72 @@ _bt_insertonpg(Relation rel, itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ + /* + * Do we need to split an existing posting list item? + */ + if (in_posting_offset != 0) + { + ItemId itemid = PageGetItemId(page, newitemoff); + int nipd; + char *replacepos; + char *rightpos; + Size nbytes; + + /* + * The new tuple is a duplicate with a heap TID that falls inside the + * range of an existing posting list tuple, so split posting list. + * + * Posting list splits always replace some existing TID in the posting + * list with the new item's heap TID (based on a posting list offset + * from caller) by removing rightmost heap TID from posting list. The + * new item's heap TID is swapped with that rightmost heap TID, almost + * as if the tuple inserted never overlapped with a posting list in + * the first place. This allows the insertion and page split code to + * have minimal special case handling of posting lists. + * + * The only extra handling required is to overwrite the original + * posting list with nposting, which is guaranteed to be the same size + * as the original, keeping the page space accounting simple. This + * takes place in either the page insert or page split critical + * section. + */ + Assert(P_ISLEAF(lpageop)); + Assert(!ItemIdIsDead(itemid)); + Assert(in_posting_offset > 0); + oposting = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleIsPosting(oposting)); + nipd = BTreeTupleGetNPosting(oposting); + Assert(in_posting_offset < nipd); + + nposting = CopyIndexTuple(oposting); + replacepos = (char *) BTreeTupleGetPostingN(nposting, in_posting_offset); + rightpos = replacepos + sizeof(ItemPointerData); + nbytes = (nipd - in_posting_offset - 1) * sizeof(ItemPointerData); + + /* + * Move item pointers in posting list to make a gap for the new item's + * heap TID (shift TIDs one place to the right, losing original + * rightmost TID). + */ + memmove(rightpos, replacepos, nbytes); + + /* + * Replace newitem's heap TID with rightmost heap TID from original + * posting list + */ + ItemPointerCopy(&itup->t_tid, (ItemPointer) replacepos); + + /* + * Copy original (not new original) posting list's last TID into new + * item + */ + ItemPointerCopy(BTreeTupleGetPostingN(oposting, nipd - 1), &itup->t_tid); + Assert(ItemPointerCompare(BTreeTupleGetMaxTID(nposting), + BTreeTupleGetHeapTID(itup)) < 0); + /* Alter new item offset, since effective new item changed */ + newitemoff = OffsetNumberNext(newitemoff); + } + /* * Do we need to split the page to fit the item on it? 
* @@ -996,7 +1112,8 @@ _bt_insertonpg(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup); + rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup, + nposting); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), BufferGetBlockNumber(rbuf)); @@ -1075,6 +1192,18 @@ _bt_insertonpg(Relation rel, elog(PANIC, "failed to add new item to block %u in index \"%s\"", itup_blkno, RelationGetRelationName(rel)); + if (nposting) + { + /* + * Handle a posting list split by performing an in-place update of + * the existing posting list + */ + Assert(P_ISLEAF(lpageop)); + Assert(MAXALIGN(IndexTupleSize(oposting)) == + MAXALIGN(IndexTupleSize(nposting))); + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + } + MarkBufferDirty(buf); if (BufferIsValid(metabuf)) @@ -1116,6 +1245,9 @@ _bt_insertonpg(Relation rel, XLogRecPtr recptr; xlrec.offnum = itup_off; + xlrec.postingsz = 0; + if (nposting) + xlrec.postingsz = MAXALIGN(IndexTupleSize(itup)); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert); @@ -1153,6 +1285,9 @@ _bt_insertonpg(Relation rel, XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup)); + if (nposting) + XLogRegisterBufData(0, (char *) nposting, + IndexTupleSize(nposting)); recptr = XLogInsert(RM_BTREE_ID, xlinfo); @@ -1194,6 +1329,10 @@ _bt_insertonpg(Relation rel, _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) RelationSetTargetBlock(rel, cachedBlock); } + + /* be tidy */ + if (nposting) + pfree(nposting); } /* @@ -1211,10 +1350,16 @@ _bt_insertonpg(Relation rel, * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. + * + * nposting is a replacement posting for the posting list at the + * offset immediately before the new item's offset. This is needed + * when caller performed "posting list split", and corresponds to the + * same step for retail insertions that don't split the page. */ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, - OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem) + OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, + IndexTuple nposting) { Buffer rbuf; Page origpage; @@ -1236,12 +1381,20 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber firstright; OffsetNumber maxoff; OffsetNumber i; + OffsetNumber replacepostingoff = InvalidOffsetNumber; bool newitemonleft, isleaf; IndexTuple lefthikey; int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + /* + * Determine offset number of posting list that will be updated in place + * as part of split that follows a posting list split + */ + if (nposting != NULL) + replacepostingoff = OffsetNumberPrev(newitemoff); + /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back @@ -1273,6 +1426,13 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * newitemoff == firstright. In all other cases it's clear which side of * the split every tuple goes on from context. newitemonleft is usually * (but not always) redundant information. 
+ * + * Note: In theory, the split point choice logic should operate against a + * version of the page that already replaced the posting list at offset + * replacepostingoff with nposting where applicable. We don't bother with + * that, though. Both versions of the posting list must be the same size + * and have the same key values, so this omission can't affect the split + * point chosen in practice. */ firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, newitem, &newitemonleft); @@ -1340,6 +1500,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemid = PageGetItemId(origpage, firstright); itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (firstright == replacepostingoff) + item = nposting; } /* @@ -1373,6 +1536,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); itemid = PageGetItemId(origpage, lastleftoff); lastleft = (IndexTuple) PageGetItem(origpage, itemid); + /* Behave as if origpage posting list has already been swapped */ + if (lastleftoff == replacepostingoff) + lastleft = nposting; } Assert(lastleft != item); @@ -1480,8 +1646,23 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); + /* + * did caller pass new replacement posting list tuple due to posting + * list split? + */ + if (i == replacepostingoff) + { + /* + * swap origpage posting list with post-posting-list-split version + * from caller + */ + Assert(isleaf); + Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); + item = nposting; + } + /* does new item belong before this one? */ - if (i == newitemoff) + else if (i == newitemoff) { if (newitemonleft) { @@ -1652,6 +1833,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, xlrec.level = ropaque->btpo.level; xlrec.firstright = firstright; xlrec.newitemoff = newitemoff; + xlrec.replacepostingoff = replacepostingoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit); @@ -1676,6 +1858,10 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, if (newitemonleft) XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); + if (replacepostingoff) + XLogRegisterBufData(0, (char *) nposting, + MAXALIGN(IndexTupleSize(nposting))); + /* Log the left page's new high key */ itemid = PageGetItemId(origpage, P_HIKEY); item = (IndexTuple) PageGetItem(origpage, itemid); @@ -1834,7 +2020,7 @@ _bt_insert_parent(Relation rel, /* Recursively insert into the parent */ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent, - new_item, stack->bts_offset + 1, + new_item, stack->bts_offset + 1, 0, is_only); /* be tidy */ @@ -2304,6 +2490,250 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) * Note: if we didn't find any LP_DEAD items, then the page's * BTP_HAS_GARBAGE hint bit is falsely set. We do not bother expending a * separate write to clear it, however. We will clear it when we split - * the page. + * the page (or when deduplication runs). */ } + +/* + * Try to deduplicate items to free some space. If we don't proceed with + * deduplication, buffer will contain old state of the page. + * + * 'itemsz' is the size of the inserter caller's incoming/new tuple, not + * including line pointer overhead. 
This is the amount of space we'll need to + * free in order to let caller avoid splitting the page. + * + * This function should be called after LP_DEAD items were removed by + * _bt_vacuum_one_page() to prevent a page split. (It's possible that we'll + * have to kill additional LP_DEAD items, but that should be rare.) + */ +static void +_bt_dedup_one_page(Relation rel, Buffer buffer, Relation heapRel, Size itemsz) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buffer); + Page newpage; + BTPageOpaque oopaque, + nopaque; + bool deduplicate = false; + BTDedupState *dedupState = NULL; + int natts = IndexRelationGetNumberOfAttributes(rel); + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + + /* + * Don't use deduplication for indexes with INCLUDEd columns and unique + * indexes + */ + deduplicate = (IndexRelationGetNumberOfKeyAttributes(rel) == + IndexRelationGetNumberOfAttributes(rel) && + !rel->rd_index->indisunique); + if (!deduplicate) + return; + + oopaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* init deduplication state needed to build posting tuples */ + dedupState = (BTDedupState *) palloc0(sizeof(BTDedupState)); + dedupState->ipd = NULL; + dedupState->ntuples = 0; + dedupState->itupprev = NULL; + dedupState->maxitemsize = BTMaxItemSize(page); + dedupState->maxpostingsize = 0; + + minoff = P_FIRSTDATAKEY(oopaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Delete dead tuples if any. We cannot simply skip them in the cycle + * below, because it's necessary to generate special Xlog record + * containing such tuples to compute latestRemovedXid on a standby server + * later. + * + * This should not affect performance, since it only can happen in a rare + * situation when BTP_HAS_GARBAGE flag was not set and _bt_vacuum_one_page + * was not called, or _bt_vacuum_one_page didn't remove all dead items. + */ + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemid)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) + { + /* + * Skip duplication in rare cases where there were LP_DEAD items + * encountered here when that frees sufficient space for caller to + * avoid a page split + */ + _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); + if (PageGetFreeSpace(page) >= itemsz) + { + pfree(dedupState); + return; + } + + /* Continue with deduplication */ + minoff = P_FIRSTDATAKEY(oopaque); + maxoff = PageGetMaxOffsetNumber(page); + } + + /* + * Scan over all items to see which ones can be deduplicated + */ + newpage = PageGetTempPageCopySpecial(page); + nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage); + + /* Make sure that new page won't have garbage flag set */ + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* Copy High Key if any */ + if (!P_RIGHTMOST(oopaque)) + { + ItemId hitemid = PageGetItemId(page, P_HIKEY); + Size hitemsz = ItemIdGetLength(hitemid); + IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); + + if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add highkey during deduplication"); + } + + /* + * Iterate over tuples on the page, try to deduplicate them into posting + * lists and insert into new page. 
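+	 * The loop keeps a "pending" tuple in dedupState->itupprev.  Each
+	 * tuple whose attributes all match the pending tuple has its heap
+	 * TIDs stashed, so long as the projected posting list still fits
+	 * within maxitemsize; otherwise, or when an unequal tuple is reached,
+	 * the pending tuple is flushed to the new page by _bt_dedup_insert().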
+ */ + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (dedupState->itupprev == NULL) + { + /* Just set up base/first item in first iteration */ + Assert(offnum == minoff); + dedupState->itupprev = CopyIndexTuple(itup); + continue; + } + + if (deduplicate && + _bt_keep_natts_fast(rel, dedupState->itupprev, itup) > natts) + { + int itup_ntuples; + Size projpostingsz; + + /* + * Tuples are equal. + * + * If posting list does not exceed tuple size limit then append + * the tuple to the pending posting list. Otherwise, insert it on + * page and continue with this tuple as new pending posting list. + */ + itup_ntuples = BTreeTupleIsPosting(itup) ? + BTreeTupleGetNPosting(itup) : 1; + + /* + * Project size of new posting list that would result from merging + * current tup with pending posting list (could just be prev item + * that's "pending"). + * + * This accounting looks odd, but it's correct because ... + */ + projpostingsz = MAXALIGN(IndexTupleSize(dedupState->itupprev) + + (dedupState->ntuples + itup_ntuples + 1) * + sizeof(ItemPointerData)); + + if (projpostingsz <= dedupState->maxitemsize) + _bt_stash_item_tid(dedupState, itup); + else + _bt_dedup_insert(newpage, dedupState); + } + else + { + /* + * Tuples are not equal, or we're done deduplicating this page. + * + * Insert pending posting list on page. This could just be a + * regular tuple. + */ + _bt_dedup_insert(newpage, dedupState); + } + + pfree(dedupState->itupprev); + dedupState->itupprev = CopyIndexTuple(itup); + } + + /* Handle the last item */ + _bt_dedup_insert(newpage, dedupState); + + START_CRIT_SECTION(); + + PageRestoreTempPage(newpage, page); + MarkBufferDirty(buffer); + + /* Log full page write */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = log_newpage_buffer(buffer, true); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* be tidy */ + pfree(dedupState); +} + +/* + * Add new posting tuple item to the page based on itupprev and saved list of + * heap TIDs. 
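+ * If no heap TIDs have been stashed (ntuples == 0), itupprev is added to
+ * the page unchanged; otherwise a new posting list tuple is formed from
+ * itupprev and the stashed heap TIDs.  ntuples is reset so that the
+ * caller can start accumulating a new pending tuple.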
+ */ +static void +_bt_dedup_insert(Page page, BTDedupState *dedupState) +{ + IndexTuple to_insert; + OffsetNumber offnum = PageGetMaxOffsetNumber(page); + + if (dedupState->ntuples == 0) + { + /* + * Use original itupprev, which may or may not be a posting list + * already from some earlier dedup attempt + */ + to_insert = dedupState->itupprev; + } + else + { + IndexTuple postingtuple; + + /* form a tuple with a posting list */ + postingtuple = BTreeFormPostingTuple(dedupState->itupprev, + dedupState->ipd, + dedupState->ntuples); + to_insert = postingtuple; + pfree(dedupState->ipd); + } + + Assert(IndexTupleSize(dedupState->itupprev) <= dedupState->maxitemsize); + /* Add the new item into the page */ + offnum = OffsetNumberNext(offnum); + + if (PageAddItem(page, (Item) to_insert, IndexTupleSize(to_insert), + offnum, false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add tuple to page"); + + if (dedupState->ntuples > 0) + pfree(to_insert); + dedupState->ntuples = 0; +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 268f869a36..5314bbe2a9 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/nbtxlog.h" +#include "access/tableam.h" #include "access/transam.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -42,6 +43,11 @@ static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, BlockNumber *target, BlockNumber *rightsib); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid); +static TransactionId _bt_compute_xid_horizon_for_tuples(Relation rel, + Relation heapRel, + Buffer buf, + OffsetNumber *itemnos, + int nitems); /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image @@ -983,14 +989,52 @@ _bt_page_recyclable(Page page) void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, + OffsetNumber *remainingoffset, + IndexTuple *remaining, int nremaining, BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; + Size itemsz; + Size remaining_sz = 0; + char *remaining_buf = NULL; + + /* XLOG stuff, buffer for remainings */ + if (nremaining && RelationNeedsWAL(rel)) + { + Size offset = 0; + + for (int i = 0; i < nremaining; i++) + remaining_sz += MAXALIGN(IndexTupleSize(remaining[i])); + + remaining_buf = palloc0(remaining_sz); + for (int i = 0; i < nremaining; i++) + { + itemsz = IndexTupleSize(remaining[i]); + memcpy(remaining_buf + offset, (char *) remaining[i], itemsz); + offset += MAXALIGN(itemsz); + } + Assert(offset == remaining_sz); + } /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); + /* Handle posting tuples here */ + for (int i = 0; i < nremaining; i++) + { + /* At first, delete the old tuple. */ + PageIndexTupleDelete(page, remainingoffset[i]); + + itemsz = IndexTupleSize(remaining[i]); + itemsz = MAXALIGN(itemsz); + + /* Add tuple with remaining ItemPointers to the page. 
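+		 * The replacement tuple only contains the TIDs that must remain,
+		 * so it can be no larger than the tuple just deleted from this
+		 * offset.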
*/ + if (PageAddItem(page, (Item) remaining[i], itemsz, remainingoffset[i], + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to rewrite posting list item in index while doing vacuum"); + } + /* Fix the page */ if (nitems > 0) PageIndexMultiDelete(page, itemnos, nitems); @@ -1020,6 +1064,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, xl_btree_vacuum xlrec_vacuum; xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + xlrec_vacuum.nremaining = nremaining; + xlrec_vacuum.ndeleted = nitems; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); @@ -1033,6 +1079,19 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, if (nitems > 0) XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber)); + /* + * Here we should save offnums and remaining tuples themselves. It's + * important to restore them in correct order. At first, we must + * handle remaining tuples and only after that other deleted items. + */ + if (nremaining > 0) + { + Assert(remaining_buf != NULL); + XLogRegisterBufData(0, (char *) remainingoffset, + nremaining * sizeof(OffsetNumber)); + XLogRegisterBufData(0, remaining_buf, remaining_sz); + } + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); PageSetLSN(page, recptr); @@ -1041,6 +1100,91 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, END_CRIT_SECTION(); } +/* + * Get the latestRemovedXid from the table entries pointed at by the index + * tuples being deleted. + * + * This is a version of index_compute_xid_horizon_for_tuples() specialized to + * nbtree, which can handle posting lists. + */ +static TransactionId +_bt_compute_xid_horizon_for_tuples(Relation rel, Relation heapRel, + Buffer buf, OffsetNumber *itemnos, + int nitems) +{ + ItemPointerData *ttids; + TransactionId latestRemovedXid = InvalidTransactionId; + Page page = BufferGetPage(buf); + int arraynitems; + int finalnitems; + + /* + * Initial size of array can fit everything when it turns out that are no + * posting lists + */ + arraynitems = nitems; + ttids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * arraynitems); + + finalnitems = 0; + /* identify what the index tuples about to be deleted point to */ + for (int i = 0; i < nitems; i++) + { + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, itemnos[i]); + itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(ItemIdIsDead(itemid)); + + if (!BTreeTupleIsPosting(itup)) + { + /* Make sure that we have space for additional heap TID */ + if (finalnitems + 1 > arraynitems) + { + arraynitems = arraynitems * 2; + ttids = (ItemPointerData *) + repalloc(ttids, sizeof(ItemPointerData) * arraynitems); + } + + Assert(ItemPointerIsValid(&itup->t_tid)); + ItemPointerCopy(&itup->t_tid, &ttids[finalnitems]); + finalnitems++; + } + else + { + int nposting = BTreeTupleGetNPosting(itup); + + /* Make sure that we have space for additional heap TIDs */ + if (finalnitems + nposting > arraynitems) + { + arraynitems = Max(arraynitems * 2, finalnitems + nposting); + ttids = (ItemPointerData *) + repalloc(ttids, sizeof(ItemPointerData) * arraynitems); + } + + for (int j = 0; j < nposting; j++) + { + ItemPointer htid = BTreeTupleGetPostingN(itup, j); + + Assert(ItemPointerIsValid(htid)); + ItemPointerCopy(htid, &ttids[finalnitems]); + finalnitems++; + } + } + } + + Assert(finalnitems >= nitems); + + /* determine the actual xid horizon */ + latestRemovedXid = + table_compute_xid_horizon_for_tuples(heapRel, ttids, finalnitems); + + pfree(ttids); + + return latestRemovedXid; +} + /* * Delete item(s) from a btree page during 
single-page cleanup. * @@ -1067,8 +1211,8 @@ _bt_delitems_delete(Relation rel, Buffer buf, if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = - index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - itemnos, nitems); + _bt_compute_xid_horizon_for_tuples(rel, heapRel, buf, + itemnos, nitems); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4cfd5289ad..67595319d7 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -97,6 +97,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static ItemPointer btreevacuumPosting(BTVacState *vstate, IndexTuple itup, + int *nremaining); /* @@ -263,8 +265,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) */ if (so->killedItems == NULL) so->killedItems = (int *) - palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) + palloc(MaxPostingIndexTuplesPerPage * sizeof(int)); + if (so->numKilled < MaxPostingIndexTuplesPerPage) so->killedItems[so->numKilled++] = so->currPos.itemIndex; } @@ -1069,7 +1071,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_checkpage(rel, buf); - _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); + _bt_delitems_vacuum(rel, buf, NULL, 0, NULL, NULL, 0, + vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } @@ -1193,6 +1196,9 @@ restart: OffsetNumber offnum, minoff, maxoff; + IndexTuple remaining[MaxOffsetNumber]; + OffsetNumber remainingoffset[MaxOffsetNumber]; + int nremaining; /* * Trade in the initial read lock for a super-exclusive write lock on @@ -1229,6 +1235,7 @@ restart: * callback function. */ ndeletable = 0; + nremaining = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) @@ -1242,31 +1249,79 @@ restart: itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - htup = &(itup->t_tid); - /* - * During Hot Standby we currently assume that - * XLOG_BTREE_VACUUM records do not produce conflicts. That is - * only true as long as the callback function depends only - * upon whether the index tuple refers to heap tuples removed - * in the initial heap scan. When vacuum starts it derives a - * value of OldestXmin. Backends taking later snapshots could - * have a RecentGlobalXmin with a later xid than the vacuum's - * OldestXmin, so it is possible that row versions deleted - * after OldestXmin could be marked as killed by other - * backends. The callback function *could* look at the index - * tuple state in isolation and decide to delete the index - * tuple, though currently it does not. If it ever did, we - * would need to reconsider whether XLOG_BTREE_VACUUM records - * should cause conflicts. If they did cause conflicts they - * would be fairly harsh conflicts, since we haven't yet - * worked out a way to pass a useful value for - * latestRemovedXid on the XLOG_BTREE_VACUUM records. This - * applies to *any* type of index that marks index tuples as - * killed. 
- */ - if (callback(htup, callback_state)) - deletable[ndeletable++] = offnum; + if (BTreeTupleIsPosting(itup)) + { + int nnewipd = 0; + ItemPointer newipd = NULL; + + newipd = btreevacuumPosting(vstate, itup, &nnewipd); + + if (nnewipd == 0) + { + /* + * All TIDs from posting list must be deleted, we can + * delete whole tuple in a regular way. + */ + deletable[ndeletable++] = offnum; + } + else if (nnewipd == BTreeTupleGetNPosting(itup)) + { + /* + * All TIDs from posting tuple must remain. Do + * nothing, just cleanup. + */ + pfree(newipd); + } + else if (nnewipd < BTreeTupleGetNPosting(itup)) + { + /* Some TIDs from posting tuple must remain. */ + Assert(nnewipd > 0); + Assert(newipd != NULL); + + /* + * Form new tuple that contains only remaining TIDs. + * Remember this tuple and the offset of the old tuple + * to update it in place. + */ + remainingoffset[nremaining] = offnum; + remaining[nremaining] = + BTreeFormPostingTuple(itup, newipd, nnewipd); + nremaining++; + pfree(newipd); + + Assert(IndexTupleSize(itup) <= BTMaxItemSize(page)); + } + } + else + { + htup = &(itup->t_tid); + + /* + * During Hot Standby we currently assume that + * XLOG_BTREE_VACUUM records do not produce conflicts. + * That is only true as long as the callback function + * depends only upon whether the index tuple refers to + * heap tuples removed in the initial heap scan. When + * vacuum starts it derives a value of OldestXmin. + * Backends taking later snapshots could have a + * RecentGlobalXmin with a later xid than the vacuum's + * OldestXmin, so it is possible that row versions deleted + * after OldestXmin could be marked as killed by other + * backends. The callback function *could* look at the + * index tuple state in isolation and decide to delete the + * index tuple, though currently it does not. If it ever + * did, we would need to reconsider whether + * XLOG_BTREE_VACUUM records should cause conflicts. If + * they did cause conflicts they would be fairly harsh + * conflicts, since we haven't yet worked out a way to + * pass a useful value for latestRemovedXid on the + * XLOG_BTREE_VACUUM records. This applies to *any* type + * of index that marks index tuples as killed. + */ + if (callback(htup, callback_state)) + deletable[ndeletable++] = offnum; + } } } @@ -1274,7 +1329,7 @@ restart: * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ - if (ndeletable > 0) + if (ndeletable > 0 || nremaining > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes @@ -1291,6 +1346,7 @@ restart: * that. */ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, + remainingoffset, remaining, nremaining, vstate->lastBlockVacuumed); /* @@ -1375,6 +1431,41 @@ restart: } } +/* + * btreevacuumPosting() -- vacuums a posting tuple. + * + * Returns new palloc'd posting list with remaining items. + * Posting list size is returned via nremaining. + * + * If all items are dead, + * nremaining is 0 and resulting posting list is NULL. 
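+ * If no items are dead, the returned posting list is a palloc'd copy of
+ * the original (nremaining equals the original number of items); caller
+ * detects this case and frees the copy without modifying the page.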
+ */ +static ItemPointer +btreevacuumPosting(BTVacState *vstate, IndexTuple itup, int *nremaining) +{ + int remaining = 0; + int nitem = BTreeTupleGetNPosting(itup); + ItemPointer tmpitems = NULL, + items = BTreeTupleGetPosting(itup); + + /* + * Check each tuple in the posting list, save alive tuples into tmpitems + */ + for (int i = 0; i < nitem; i++) + { + if (vstate->callback(items + i, vstate->callback_state)) + continue; + + if (tmpitems == NULL) + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + + tmpitems[remaining++] = items[i]; + } + + *nremaining = remaining; + return tmpitems; +} + /* * btcanreturn() -- Check whether btree indexes support index-only scans. * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 8e512461a0..c78c8e67b5 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,10 +26,18 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static int _bt_binsrch_posting(BTScanInsert key, Page page, + OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); +static void _bt_setuppostingitems(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer iptr, + IndexTuple itup); +static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, + OffsetNumber offnum, ItemPointer iptr, + IndexTuple itup); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, @@ -434,7 +442,10 @@ _bt_binsrch(Relation rel, * low) makes bounds invalid. * * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time. + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's in_posting_offset field to + * determine which existing heap TID will need to be replaced by their + * scantid/new heap TID). */ OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate) @@ -453,6 +464,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) Assert(P_ISLEAF(opaque)); Assert(!key->nextkey); + Assert(insertstate->in_posting_offset == 0); if (!insertstate->bounds_valid) { @@ -509,6 +521,17 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) if (result != 0) stricthigh = high; } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set in_posting_offset for caller. Caller must + * split the posting list when in_posting_offset is set. This should + * happen infrequently. + */ + if (unlikely(result == 0 && key->scantid != NULL)) + insertstate->in_posting_offset = + _bt_binsrch_posting(key, page, mid); } /* @@ -528,6 +551,68 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) return low; } +/*---------- + * _bt_binsrch_posting() -- posting list binary search. + * + * Returns offset into posting list where caller's scantid belongs. 
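+ * Returns 0 when the tuple at offnum turns out not to be a posting list
+ * tuple, and -1 when the posting list tuple's LP_DEAD bit is set (caller
+ * must remove LP_DEAD items and then retry its binary search).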
+ *---------- + */ +static int +_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) +{ + IndexTuple itup; + ItemId itemid; + int low, + high, + mid, + res; + + /* + * If this isn't a posting tuple, then the index must be corrupt (if it is + * an ordinary non-pivot tuple then there must be an existing tuple with a + * heap TID that equals inserter's new heap TID/scantid). Defensively + * check that tuple is a posting list tuple whose posting list range + * includes caller's scantid. + * + * (This is also needed because contrib/amcheck's rootdescend option needs + * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().) + */ + Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page))); + Assert(!key->nextkey); + Assert(key->scantid != NULL); + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + if (!BTreeTupleIsPosting(itup)) + return 0; + + /* + * In the unlikely event that posting list tuple has LP_DEAD bit set, + * signal to caller that it should kill the item and restart its binary + * search. + */ + if (ItemIdIsDead(itemid)) + return -1; + + /* "high" is past end of posting list for loop invariant */ + low = 0; + high = BTreeTupleGetNPosting(itup); + Assert(high >= 2); + + while (high > low) + { + mid = low + ((high - low) / 2); + res = ItemPointerCompare(key->scantid, + BTreeTupleGetPostingN(itup, mid)); + + if (res >= 1) + low = mid + 1; + else + high = mid; + } + + return low; +} + /*---------- * _bt_compare() -- Compare insertion-type scankey to tuple on a page. * @@ -537,9 +622,18 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be - * returned to the caller as a matching key! + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). + * + * It is generally guaranteed that any possible scankey with scantid set + * will have zero or one tuples in the index that are considered equal + * here. * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the @@ -563,6 +657,7 @@ _bt_compare(Relation rel, ScanKey scankey; int ncmpkey; int ntupatts; + int32 result; Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); @@ -597,7 +692,6 @@ _bt_compare(Relation rel, { Datum datum; bool isNull; - int32 result; datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); @@ -713,8 +807,24 @@ _bt_compare(Relation rel, if (heapTid == NULL) return 1; + /* + * scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * as a simple scalar value. 
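+ * In other words: a scantid that sorts before the posting list's first
+ * heap TID compares as less than the tuple, a scantid that sorts after
+ * BTreeTupleGetMaxTID() compares as greater, and any scantid that falls
+ * within the posting list's TID range is reported as equal.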
+ */ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - return ItemPointerCompare(key->scantid, heapTid); + result = ItemPointerCompare(key->scantid, heapTid); + if (!BTreeTupleIsPosting(itup) || result <= 0) + return result; + else + { + result = ItemPointerCompare(key->scantid, BTreeTupleGetMaxTID(itup)); + if (result > 0) + return 1; + } + + return 0; } /* @@ -1451,6 +1561,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* initialize tuple workspace to empty */ so->currPos.nextTupleOffset = 0; + so->currPos.postingTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be @@ -1485,8 +1596,30 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; + if (!BTreeTupleIsPosting(itup)) + { + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + /* + * Setup state to return posting list, and save first + * "logical" tuple + */ + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Save additional posting list "logical" tuples */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + itup); + itemIndex++; + } + } } /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) @@ -1519,7 +1652,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (!continuescan) so->currPos.moreRight = false; - Assert(itemIndex <= MaxIndexTuplesPerPage); + Assert(itemIndex <= MaxPostingIndexTuplesPerPage); so->currPos.firstItem = 0; so->currPos.lastItem = itemIndex - 1; so->currPos.itemIndex = 0; @@ -1527,7 +1660,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) else { /* load items[] in descending order */ - itemIndex = MaxIndexTuplesPerPage; + itemIndex = MaxPostingIndexTuplesPerPage; offnum = Min(offnum, maxoff); @@ -1569,8 +1702,37 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions, so remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + if (!BTreeTupleIsPosting(itup)) + { + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int i = BTreeTupleGetNPosting(itup) - 1; + + /* + * Setup state to return posting list, and save last + * "logical" tuple from posting list (since it's the first + * that will be returned to scan). 
+ */ + itemIndex--; + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i--), + itup); + + /* + * Return posting list "logical" tuples -- do this in + * descending order, to match overall scan order + */ + for (; i >= 0; i--) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + itup); + } + } } if (!continuescan) { @@ -1584,8 +1746,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + so->currPos.lastItem = MaxPostingIndexTuplesPerPage - 1; + so->currPos.itemIndex = MaxPostingIndexTuplesPerPage - 1; } return (so->currPos.firstItem <= so->currPos.lastItem); @@ -1598,6 +1760,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + Assert(!BTreeTupleIsPosting(itup)); + currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -1610,6 +1774,61 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, } } +/* + * Setup state to save posting items from a single posting list tuple. Saves + * the logical tuple that will be returned to scan first in passing. + * + * Saves an index item into so->currPos.items[itemIndex] for logical tuple + * that is returned to scan first. Second or subsequent heap TID for posting + * list should be saved by calling _bt_savepostingitem(). + */ +static void +_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer iptr, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *iptr; + currItem->indexOffset = offnum; + + if (so->currTuples) + { + /* Save a truncated version of the IndexTuple */ + Size itupsz = BTreeTupleGetPostingOffset(itup); + + itupsz = MAXALIGN(itupsz); + currItem->tupleOffset = so->currPos.nextTupleOffset; + memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); + so->currPos.nextTupleOffset += itupsz; + so->currPos.postingTupleOffset = currItem->tupleOffset; + } +} + +/* + * Save an index item into so->currPos.items[itemIndex] for posting tuple. + * + * Assumes that _bt_setuppostingitems() has already been called for current + * posting list tuple. 
+ */ +static inline void +_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, + ItemPointer iptr, IndexTuple itup) +{ + BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + + currItem->heapTid = *iptr; + currItem->indexOffset = offnum; + + if (so->currTuples) + { + /* + * Have index-only scans return the same truncated IndexTuple for + * every logical tuple that originates from the same posting list + */ + currItem->tupleOffset = so->currPos.postingTupleOffset; + } +} + /* * _bt_steppage() -- Step to next page containing valid data for scan * diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index ab19692006..a2484f3e3b 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -288,6 +288,8 @@ static void _bt_sortaddtup(Page page, Size itemsize, static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); +static void _bt_buildadd_posting(BTWriteState *wstate, BTPageState *state, + BTDedupState *dedupState); static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2); static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, @@ -830,6 +832,8 @@ _bt_sortaddtup(Page page, * the high key is to be truncated, offset 1 is deleted, and we insert * the truncated high key at offset 1. * + * Note that itup may be a posting list tuple. + * * 'last' pointer indicates the last offset added to the page. *---------- */ @@ -963,6 +967,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * Overwrite the old item with new truncated high key directly. * oitup is already located at the physical beginning of tuple * space, so this should directly reuse the existing tuple space. + * + * If lastleft tuple was a posting tuple, we'll truncate its + * posting list in _bt_truncate as well. Note that it is also + * applicable only to leaf pages, since internal pages never + * contain posting tuples. */ ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); lastleft = (IndexTuple) PageGetItem(opage, ii); @@ -1002,6 +1011,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the minimum key for the new page. */ state->btps_minkey = CopyIndexTuple(oitup); + Assert(BTreeTupleIsPivot(state->btps_minkey)); /* * Set the sibling links for both pages. @@ -1043,6 +1053,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); /* _bt_sortaddtup() will perform full truncation later */ + BTreeTupleClearBtIsPosting(state->btps_minkey); BTreeTupleSetNAtts(state->btps_minkey, 0); } @@ -1127,6 +1138,112 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); } +/* + * Add new tuple (posting or non-posting) to the page while building index. 
+ */ +static void +_bt_buildadd_posting(BTWriteState *wstate, BTPageState *state, + BTDedupState *dedupState) +{ + IndexTuple to_insert; + + /* Return, if there is no tuple to insert */ + if (state == NULL) + return; + + if (dedupState->ntuples == 0) + to_insert = dedupState->itupprev; + else + { + IndexTuple postingtuple; + + /* form a tuple with a posting list */ + postingtuple = BTreeFormPostingTuple(dedupState->itupprev, + dedupState->ipd, + dedupState->ntuples); + to_insert = postingtuple; + pfree(dedupState->ipd); + } + + _bt_buildadd(wstate, state, to_insert); + + if (dedupState->ntuples > 0) + pfree(to_insert); + dedupState->ntuples = 0; +} + +/* + * Save item pointer(s) of itup to the posting list in dedupState. + * + * 'itup' is current tuple on page, which comes immediately after equal + * 'itupprev' tuple stashed in dedup state at the point we're called. + * + * Helper function for _bt_load() and _bt_dedup_one_page(), called when it + * becomes clear that pending itupprev item will be part of a new/pending + * posting list, or when a pending/new posting list will contain a new heap + * TID from itup. + * + * Note: caller is responsible for the BTMaxItemSize() check. + */ +void +_bt_stash_item_tid(BTDedupState *dedupState, IndexTuple itup) +{ + int nposting = 0; + + if (dedupState->ntuples == 0) + { + dedupState->ipd = palloc0(dedupState->maxitemsize); + + /* + * itupprev hasn't had its posting list TIDs copied into ipd yet (must + * have been first on page and/or in new posting list?). Do so now. + * + * This is delayed because it wasn't initially clear whether or not + * itupprev would be merged with the next tuple, or stay as-is. By + * now caller compared it against itup and found that it was equal, so + * we can go ahead and add its TIDs. + */ + if (!BTreeTupleIsPosting(dedupState->itupprev)) + { + memcpy(dedupState->ipd, dedupState->itupprev, + sizeof(ItemPointerData)); + dedupState->ntuples++; + } + else + { + /* if itupprev is posting, add all its TIDs to the posting list */ + nposting = BTreeTupleGetNPosting(dedupState->itupprev); + memcpy(dedupState->ipd, + BTreeTupleGetPosting(dedupState->itupprev), + sizeof(ItemPointerData) * nposting); + dedupState->ntuples += nposting; + } + } + + /* + * Add current tup to ipd for pending posting list for new version of + * page. + */ + if (!BTreeTupleIsPosting(itup)) + { + memcpy(dedupState->ipd + dedupState->ntuples, itup, + sizeof(ItemPointerData)); + dedupState->ntuples++; + } + else + { + /* + * if tuple is posting, add all its TIDs to the pending list that will + * become new posting list later on + */ + nposting = BTreeTupleGetNPosting(itup); + memcpy(dedupState->ipd + dedupState->ntuples, + BTreeTupleGetPosting(itup), + sizeof(ItemPointerData) * nposting); + dedupState->ntuples += nposting; + } +} + /* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. 
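For orientation, the two helpers above are driven from _bt_load() (next hunk) by a simple compare-and-accumulate loop. The sketch below is illustrative only and is not part of the patch: the function name is invented, and setup of the page state, recomputation of maxpostingsize, progress reporting, and the two-spool merge path are omitted. The helper names and the decision logic follow the patch.

/*
 * Sketch only: how _bt_load() is expected to drive _bt_stash_item_tid()
 * and _bt_buildadd_posting() while consuming tuples in sorted order.
 */
static void
_bt_load_dedup_sketch(BTWriteState *wstate, BTPageState *state,
					  BTSpool *btspool, BTDedupState *dedupState, int natts)
{
	IndexTuple	itup;

	while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL)
	{
		if (dedupState->itupprev != NULL)
		{
			/* returns natts + 1 when every key attribute is equal */
			int			n_equal_atts =
			_bt_keep_natts_fast(wstate->index, dedupState->itupprev, itup);

			if (n_equal_atts > natts &&
				(dedupState->ntuples + 1) * sizeof(ItemPointerData) <
				dedupState->maxpostingsize)
				_bt_stash_item_tid(dedupState, itup);	/* keep accumulating */
			else
				_bt_buildadd_posting(wstate, state, dedupState);	/* flush */
		}

		/* remember current tuple; it becomes itupprev for the next pass */
		if (dedupState->itupprev)
			pfree(dedupState->itupprev);
		dedupState->itupprev = CopyIndexTuple(itup);
	}

	/* flush whatever is still pending */
	_bt_buildadd_posting(wstate, state, dedupState);
}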
@@ -1141,9 +1258,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) bool load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, - keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); + keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index), + natts = IndexRelationGetNumberOfAttributes(wstate->index); SortSupport sortKeys; int64 tuples_done = 0; + bool deduplicate = false; + BTDedupState *dedupState = NULL; + + /* + * Don't use deduplication for indexes with INCLUDEd columns and unique + * indexes + */ + deduplicate = (IndexRelationGetNumberOfKeyAttributes(wstate->index) == + IndexRelationGetNumberOfAttributes(wstate->index) && + !wstate->index->rd_index->indisunique); if (merge) { @@ -1257,19 +1385,88 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) } else { - /* merge is unnecessary */ - while ((itup = tuplesort_getindextuple(btspool->sortstate, - true)) != NULL) + if (!deduplicate) { - /* When we see first tuple, create first index page */ - if (state == NULL) - state = _bt_pagestate(wstate, 0); + /* merge is unnecessary */ + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); - _bt_buildadd(wstate, state, itup); + _bt_buildadd(wstate, state, itup); - /* Report progress */ - pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, - ++tuples_done); + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + } + else + { + /* init deduplication state needed to build posting tuples */ + dedupState = (BTDedupState *) palloc0(sizeof(BTDedupState)); + dedupState->ipd = NULL; + dedupState->ntuples = 0; + dedupState->itupprev = NULL; + dedupState->maxitemsize = 0; + dedupState->maxpostingsize = 0; + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + dedupState->maxitemsize = BTMaxItemSize(state->btps_page); + } + + if (dedupState->itupprev != NULL) + { + int n_equal_atts = _bt_keep_natts_fast(wstate->index, + dedupState->itupprev, itup); + + if (n_equal_atts > natts) + { + /* + * Tuples are equal. Create or update posting. + * + * Else If posting is too big, insert it on page and + * continue. + */ + if ((dedupState->ntuples + 1) * sizeof(ItemPointerData) < + dedupState->maxpostingsize) + _bt_stash_item_tid(dedupState, itup); + else + _bt_buildadd_posting(wstate, state, dedupState); + } + else + { + /* + * Tuples are not equal. Insert itupprev into index. + * Save current tuple for the next iteration. + */ + _bt_buildadd_posting(wstate, state, dedupState); + } + } + + /* + * Save the tuple to compare it with the next one and maybe + * unite them into a posting tuple. 
+ */ + if (dedupState->itupprev) + pfree(dedupState->itupprev); + dedupState->itupprev = CopyIndexTuple(itup); + + /* compute max size of posting list */ + dedupState->maxpostingsize = dedupState->maxitemsize - + IndexInfoFindDataOffset(dedupState->itupprev->t_info) - + MAXALIGN(IndexTupleSize(dedupState->itupprev)); + } + + /* Handle the last item */ + _bt_buildadd_posting(wstate, state, dedupState); } } diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 1c1029b6c4..54cecc85c5 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -183,6 +183,9 @@ _bt_findsplitloc(Relation rel, state.minfirstrightsz = SIZE_MAX; state.newitemoff = newitemoff; + /* newitem cannot be a posting list item */ + Assert(!BTreeTupleIsPosting(newitem)); + /* * maxsplits should never exceed maxoff because there will be at most as * many candidate split points as there are points _between_ tuples, once @@ -459,17 +462,52 @@ _bt_recsplitloc(FindSplitData *state, int16 leftfree, rightfree; Size firstrightitemsz; + Size postingsubhikey = 0; bool newitemisfirstonright; /* Is the new item going to be the first item on the right page? */ newitemisfirstonright = (firstoldonright == state->newitemoff && !newitemonleft); + /* + * FIXME: Accessing every single tuple like this adds cycles to cases that + * cannot possibly benefit (i.e. cases where we know that there cannot be + * posting lists). Maybe we should add a way to not bother when we are + * certain that this is the case. + * + * We could either have _bt_split() pass us a flag, or invent a page flag + * that indicates that the page might have posting lists, as an + * optimization. There is no shortage of btpo_flags bits for stuff like + * this. + */ if (newitemisfirstonright) + { firstrightitemsz = state->newitemsz; + + /* Calculate posting list overhead, if any */ + if (state->is_leaf && BTreeTupleIsPosting(state->newitem)) + postingsubhikey = IndexTupleSize(state->newitem) - + BTreeTupleGetPostingOffset(state->newitem); + } else + { firstrightitemsz = firstoldonrightsz; + /* Calculate posting list overhead, if any */ + if (state->is_leaf) + { + ItemId itemid; + IndexTuple newhighkey; + + itemid = PageGetItemId(state->page, firstoldonright); + newhighkey = (IndexTuple) PageGetItem(state->page, itemid); + + if (BTreeTupleIsPosting(newhighkey)) + postingsubhikey = IndexTupleSize(newhighkey) - + BTreeTupleGetPostingOffset(newhighkey); + } + } + /* Account for all the old tuples */ leftfree = state->leftspace - olddataitemstoleft; rightfree = state->rightspace - @@ -492,9 +530,13 @@ _bt_recsplitloc(FindSplitData *state, * adding a heap TID to the left half's new high key when splitting at the * leaf level. In practice the new high key will often be smaller and * will rarely be larger, but conservatively assume the worst case. + * Truncation always truncates away any posting list that appears in the + * first right tuple, though, so it's safe to subtract that overhead + * (while still conservatively assuming that truncation might have to add + * back a single heap TID using the pivot tuple heap TID representation). 
*/ if (state->is_leaf) - leftfree -= (int16) (firstrightitemsz + + leftfree -= (int16) ((firstrightitemsz - postingsubhikey) + MAXALIGN(sizeof(ItemPointerData))); else leftfree -= (int16) firstrightitemsz; @@ -691,7 +733,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); tup = (IndexTuple) PageGetItem(state->page, itemid); /* Do cheaper test first */ - if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) + if (BTreeTupleIsPosting(tup) || + !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 4c7b2d0966..e3d7f4ff0e 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -97,8 +97,6 @@ _bt_mkscankey(Relation rel, IndexTuple itup) indoption = rel->rd_indoption; tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); - /* * We'll execute search using scan key constructed on key columns. * Truncated attributes and non-key attributes are omitted from the final @@ -110,9 +108,20 @@ _bt_mkscankey(Relation rel, IndexTuple itup) key->anynullkeys = false; /* initial assumption */ key->nextkey = false; key->pivotsearch = false; + key->scantid = NULL; key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? - BTreeTupleGetHeapTID(itup) : NULL; + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + Assert(!itup || !BTreeTupleIsPosting(itup) || key->heapkeyspace); + + /* + * When caller passes a tuple with a heap TID, use it to set scantid. Note + * that this handles posting list tuples by setting scantid to the lowest + * heap TID in the posting list. 
+ */ + if (itup && key->heapkeyspace) + key->scantid = BTreeTupleGetHeapTID(itup); + skey = key->scankeys; for (i = 0; i < indnkeyatts; i++) { @@ -1786,10 +1795,35 @@ _bt_killitems(IndexScanDesc scan) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); + bool killtuple = false; - if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + if (BTreeTupleIsPosting(ituple)) { - /* found the item */ + int pi = i + 1; + int nposting = BTreeTupleGetNPosting(ituple); + int j; + + for (j = 0; j < nposting; j++) + { + ItemPointer item = BTreeTupleGetPostingN(ituple, j); + + if (!ItemPointerEquals(item, &kitem->heapTid)) + break; /* out of posting list loop */ + + /* Read-ahead to later kitems */ + if (pi < numKilled) + kitem = &so->currPos.items[so->killedItems[pi++]]; + } + + if (j == nposting) + killtuple = true; + } + else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) + killtuple = true; + + if (killtuple) + { + /* found the item/all posting list items */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ @@ -2145,6 +2179,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pivot = index_truncate_tuple(itupdesc, firstright, keepnatts); + if (BTreeTupleIsPosting(firstright)) + { + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetNAtts(pivot, keepnatts); + if (keepnatts == natts) + { + /* + * index_truncate_tuple() just returned a copy of the + * original, so make sure that the size of the new pivot tuple + * doesn't have posting list overhead + */ + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + } + + Assert(!BTreeTupleIsPosting(pivot)); + /* * If there is a distinguishing key attribute within new pivot tuple, * there is no need to add an explicit heap TID attribute @@ -2161,6 +2213,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute to the new pivot tuple. */ Assert(natts != nkeyatts); + Assert(!BTreeTupleIsPosting(lastleft) && + !BTreeTupleIsPosting(firstright)); newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData)); tidpivot = palloc0(newsize); memcpy(tidpivot, pivot, IndexTupleSize(pivot)); @@ -2168,6 +2222,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, pfree(pivot); pivot = tidpivot; } + else if (BTreeTupleIsPosting(firstright)) + { + /* + * No truncation was possible, since key attributes are all equal. We + * can always truncate away a posting list, though. + * + * It's necessary to add a heap TID attribute to the new pivot tuple. + */ + newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); + pivot = palloc0(newsize); + memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright)); + + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= newsize; + BTreeTupleClearBtIsPosting(pivot); + BTreeTupleSetAltHeapTID(pivot); + } else { /* @@ -2175,7 +2247,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * It's necessary to add a heap TID attribute to the new pivot tuple. 
*/ Assert(natts == nkeyatts); - newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData)); + newsize = MAXALIGN(IndexTupleSize(firstright)) + + MAXALIGN(sizeof(ItemPointerData)); pivot = palloc0(newsize); memcpy(pivot, firstright, IndexTupleSize(firstright)); } @@ -2193,6 +2266,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * nbtree (e.g., there is no pg_attribute entry). */ Assert(itup_key->heapkeyspace); + Assert(!BTreeTupleIsPosting(pivot)); pivot->t_info &= ~INDEX_SIZE_MASK; pivot->t_info |= newsize; @@ -2205,7 +2279,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ pivotheaptid = (ItemPointer) ((char *) pivot + newsize - sizeof(ItemPointerData)); - ItemPointerCopy(&lastleft->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetMaxTID(lastleft), pivotheaptid); /* * Lehman and Yao require that the downlink to the right page, which is to @@ -2216,9 +2290,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * tiebreaker. */ #ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0); - Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(BTreeTupleGetMaxTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #else /* @@ -2231,7 +2308,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, * attribute values along with lastleft's heap TID value when lastleft's * TID happens to be greater than firstright's TID. */ - ItemPointerCopy(&firstright->t_tid, pivotheaptid); + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); /* * Pivot heap TID should never be fully equal to firstright. Note that @@ -2240,7 +2317,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, */ ItemPointerSetOffsetNumber(pivotheaptid, OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); #endif BTreeTupleSetNAtts(pivot, nkeyatts); @@ -2321,15 +2399,25 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal (once detoasted). Similarly, result may - * differ from the _bt_keep_natts result when either tuple has TOASTed datums, - * though this is barely possible in practice. + * unless they're bitwise equal after detoasting. * * These issues must be acceptable to callers, typically because they're only * concerned about making suffix truncation as effective as possible without * leaving excessive amounts of free space on either side of page split. * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts. + * + * When an index only uses opclasses where equality is "precise", this + * function is guaranteed to give the same result as _bt_keep_natts(). 
This + * makes it safe to use this function to determine whether or not two tuples + * can be folded together into a single posting tuple. Posting list + * deduplication cannot be used with nondeterministic collations for this + * reason. + * + * FIXME: Actually invent the needed "equality-is-precise" opclass + * infrastructure. See dedicated -hackers thread: + * + * https://postgr.es/m/CAH2-Wzn3Ee49Gmxb7V1VJ3-AC8fWn-Fr8pfWQebHe8rYRxt5OQ@mail.gmail.com */ int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) @@ -2354,8 +2442,38 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) if (isNull1 != isNull2) break; + /* + * XXX: The ideal outcome from the point of view of the posting list + * patch is that the definition of an opclass with "precise equality" + * becomes: "equality operator function must give exactly the same + * answer as datum_image_eq() would, provided that we aren't using a + * nondeterministic collation". (Nondeterministic collations are + * clearly not compatible with deduplication.) + * + * This will be a lot faster than actually using the authoritative + * insertion scankey in some cases. This approach also seems more + * elegant, since suffix truncation gets to follow exactly the same + * definition of "equal" as posting list deduplication -- there is a + * subtle interplay between deduplication and suffix truncation, and + * it would be nice to know for sure that they have exactly the same + * idea about what equality is. + * + * This ideal outcome still avoids problems with TOAST. We cannot + * repeat bugs like the amcheck bug that was fixed in bugfix commit + * eba775345d23d2c999bbb412ae658b6dab36e3e8. datum_image_eq() + * considers binary equality, though only _after_ each datum is + * decompressed. + * + * If this ideal solution isn't possible, then we can fall back on + * defining "precise equality" as: "type's output function must + * produce identical textual output for any two datums that compare + * equal when using a safe/equality-is-precise operator class (unless + * using a nondeterministic collation)". That would mean that we'd + * have to make deduplication call _bt_keep_natts() instead (or some + * other function that uses authoritative insertion scankey). + */ if (!isNull1 && - !datumIsEqual(datum1, datum2, att->attbyval, att->attlen)) + !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) break; keepnatts++; @@ -2407,22 +2525,30 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); tupnatts = BTreeTupleGetNAtts(itup, rel); + /* !heapkeyspace indexes do not support deduplication */ + if (!heapkeyspace && BTreeTupleIsPosting(itup)) + return false; + + /* INCLUDE indexes do not support deduplication */ + if (natts != nkeyatts && BTreeTupleIsPosting(itup)) + return false; + if (P_ISLEAF(opaque)) { if (offnum >= P_FIRSTDATAKEY(opaque)) { /* - * Non-pivot tuples currently never use alternative heap TID - * representation -- even those within heapkeyspace indexes + * Non-pivot tuple should never be explicitly marked as a pivot + * tuple */ - if ((itup->t_info & INDEX_ALT_TID_MASK) != 0) + if (BTreeTupleIsPivot(itup)) return false; /* * Leaf tuples that are not the page high key (non-pivot tuples) * should never be truncated. (Note that tupnatts must have been - * inferred, rather than coming from an explicit on-disk - * representation.) 
+ * inferred, even with a posting list tuple, because only pivot + * tuples store tupnatts directly.) */ return tupnatts == natts; } @@ -2466,12 +2592,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * non-zero, or when there is no explicit representation and the * tuple is evidently not a pre-pg_upgrade tuple. * - * Prior to v11, downlinks always had P_HIKEY as their offset. Use - * that to decide if the tuple is a pre-v11 tuple. + * Prior to v11, downlinks always had P_HIKEY as their offset. + * Accept that as an alternative indication of a valid + * !heapkeyspace negative infinity tuple. */ return tupnatts == 0 || - ((itup->t_info & INDEX_ALT_TID_MASK) == 0 && - ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; } else { @@ -2497,7 +2623,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) * heapkeyspace index pivot tuples, regardless of whether or not there are * non-key attributes. */ - if ((itup->t_info & INDEX_ALT_TID_MASK) == 0) + if (!BTreeTupleIsPivot(itup)) + return false; + + /* Pivot tuple should not use posting list representation (redundant) */ + if (BTreeTupleIsPosting(itup)) return false; /* @@ -2567,11 +2697,87 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, BTMaxItemSizeNoHeapTid(page), RelationGetRelationName(rel)), errdetail("Index row references tuple (%u,%u) in relation \"%s\".", - ItemPointerGetBlockNumber(&newtup->t_tid), - ItemPointerGetOffsetNumber(&newtup->t_tid), + ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), + ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), RelationGetRelationName(heap)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(heap, RelationGetRelationName(rel)))); } + +/* + * Given a basic tuple that contains key datum and posting list, + * build a posting tuple. + * + * Basic tuple can be a posting tuple, but we only use key part of it, + * all ItemPointers must be passed via ipd. + * + * If nipd == 1 fallback to building a non-posting tuple. + * It is necessary to avoid storage overhead after posting tuple was vacuumed. 
+ */ +IndexTuple +BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd, int nipd) +{ + uint32 keysize, + newsize = 0; + IndexTuple itup; + + /* We only need key part of the tuple */ + if (BTreeTupleIsPosting(tuple)) + keysize = BTreeTupleGetPostingOffset(tuple); + else + keysize = IndexTupleSize(tuple); + + Assert(nipd > 0); + + /* Add space needed for posting list */ + if (nipd > 1) + newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nipd; + else + newsize = keysize; + + newsize = MAXALIGN(newsize); + itup = palloc0(newsize); + memcpy(itup, tuple, keysize); + itup->t_info &= ~INDEX_SIZE_MASK; + itup->t_info |= newsize; + + if (nipd > 1) + { + /* Form posting tuple, fill posting fields */ + + /* Set meta info about the posting list */ + itup->t_info |= INDEX_ALT_TID_MASK; + BTreeSetPostingMeta(itup, nipd, SHORTALIGN(keysize)); + + /* sort the list to preserve TID order invariant */ + qsort((void *) ipd, nipd, sizeof(ItemPointerData), + (int (*) (const void *, const void *)) ItemPointerCompare); + + /* Copy posting list into the posting tuple */ + memcpy(BTreeTupleGetPosting(itup), ipd, + sizeof(ItemPointerData) * nipd); + } + else + { + /* To finish building of a non-posting tuple, copy TID from ipd */ + itup->t_info &= ~INDEX_ALT_TID_MASK; + ItemPointerCopy(ipd, &itup->t_tid); + } + + return itup; +} + +/* + * Opposite of BTreeFormPostingTuple. + * returns regular tuple that contains the key, + * the tid of the new tuple is the nth tid of original tuple's posting list + * result tuple palloc'd in a caller's context. + */ +IndexTuple +BTreeGetNthTupleOfPosting(IndexTuple tuple, int n) +{ + Assert(BTreeTupleIsPosting(tuple)); + return BTreeFormPostingTuple(tuple, BTreeTupleGetPostingN(tuple, n), 1); +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index dd5315c1aa..d4d7c09ff0 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -178,12 +178,34 @@ btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) { Size datalen; char *datapos = XLogRecGetBlockData(record, 0, &datalen); + IndexTuple nposting = NULL; page = BufferGetPage(buffer); - if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, - false, false) == InvalidOffsetNumber) - elog(PANIC, "btree_xlog_insert: failed to add item"); + if (xlrec->postingsz > 0) + { + IndexTuple oposting; + + Assert(isleaf); + + /* oposting must be at offset before new item */ + oposting = (IndexTuple) PageGetItem(page, + PageGetItemId(page, OffsetNumberPrev(xlrec->offnum))); + if (PageAddItem(page, (Item) datapos, xlrec->postingsz, + xlrec->offnum, false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add item"); + nposting = (IndexTuple) (datapos + xlrec->postingsz); + + Assert(MAXALIGN(IndexTupleSize(oposting)) == + MAXALIGN(IndexTupleSize(nposting))); + memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); + } + else + { + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_insert: failed to add item"); + } PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -265,9 +287,11 @@ btree_xlog_split(bool onleft, XLogReaderState *record) BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; IndexTuple newitem = NULL, - left_hikey = NULL; + left_hikey = NULL, + nposting = NULL; Size newitemsz = 0, - left_hikeysz = 0; + left_hikeysz = 0, + npostingsz = 0; Page newlpage; OffsetNumber leftoff; @@ -281,6 
+305,17 @@ btree_xlog_split(bool onleft, XLogReaderState *record) datalen -= newitemsz; } + if (xlrec->replacepostingoff) + { + Assert(xlrec->replacepostingoff == + OffsetNumberPrev(xlrec->newitemoff)); + + nposting = (IndexTuple) datapos; + npostingsz = MAXALIGN(IndexTupleSize(nposting)); + datapos += npostingsz; + datalen -= npostingsz; + } + /* Extract left hikey and its size (assuming 16-bit alignment) */ left_hikey = (IndexTuple) datapos; left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); @@ -304,6 +339,15 @@ btree_xlog_split(bool onleft, XLogReaderState *record) Size itemsz; IndexTuple item; + if (off == xlrec->replacepostingoff) + { + if (PageAddItem(newlpage, (Item) nposting, npostingsz, + leftoff, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add new item to left page after split"); + leftoff = OffsetNumberNext(leftoff); + continue; + } + /* add the new item if it was inserted on left page */ if (onleft && off == xlrec->newitemoff) { @@ -386,8 +430,8 @@ btree_xlog_vacuum(XLogReaderState *record) Buffer buffer; Page page; BTPageOpaque opaque; -#ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); +#ifdef UNUSED /* * This section of code is thought to be no longer needed, after analysis @@ -478,14 +522,34 @@ btree_xlog_vacuum(XLogReaderState *record) if (len > 0) { - OffsetNumber *unused; - OffsetNumber *unend; + if (xlrec->nremaining) + { + OffsetNumber *remainingoffset; + IndexTuple remaining; + Size itemsz; - unused = (OffsetNumber *) ptr; - unend = (OffsetNumber *) ((char *) ptr + len); + remainingoffset = (OffsetNumber *) + (ptr + xlrec->ndeleted * sizeof(OffsetNumber)); + remaining = (IndexTuple) ((char *) remainingoffset + + xlrec->nremaining * sizeof(OffsetNumber)); - if ((unend - unused) > 0) - PageIndexMultiDelete(page, unused, unend - unused); + /* Handle posting tuples */ + for (int i = 0; i < xlrec->nremaining; i++) + { + PageIndexTupleDelete(page, remainingoffset[i]); + + itemsz = MAXALIGN(IndexTupleSize(remaining)); + + if (PageAddItem(page, (Item) remaining, itemsz, remainingoffset[i], + false, false) == InvalidOffsetNumber) + elog(PANIC, "btree_xlog_vacuum: failed to add remaining item"); + + remaining = (IndexTuple) ((char *) remaining + itemsz); + } + } + + if (xlrec->ndeleted) + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); } /* diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index a14eb792ec..6f71b13199 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -30,7 +30,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_insert *xlrec = (xl_btree_insert *) rec; - appendStringInfo(buf, "off %u", xlrec->offnum); + appendStringInfo(buf, "off %u; postingsz %u", + xlrec->offnum, xlrec->postingsz); break; } case XLOG_BTREE_SPLIT_L: @@ -38,6 +39,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_split *xlrec = (xl_btree_split *) rec; + /* FIXME: even master doesn't have newitemoff */ appendStringInfo(buf, "level %u, firstright %d", xlrec->level, xlrec->firstright); break; @@ -46,8 +48,10 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "lastBlockVacuumed %u", - xlrec->lastBlockVacuumed); + appendStringInfo(buf, "lastBlockVacuumed %u; nremaining %u; ndeleted %u", + xlrec->lastBlockVacuumed, + xlrec->nremaining, + xlrec->ndeleted); break; } case XLOG_BTREE_DELETE: diff --git a/src/include/access/nbtree.h 
b/src/include/access/nbtree.h
index 52eafe6b00..3aa09744e0 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -234,8 +234,7 @@ typedef struct BTMetaPageData
  *  t_tid | t_info | key values | INCLUDE columns, if any
  *
  * t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4.  Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
  *
  * All other types of index tuples ("pivot" tuples) only have key columns,
  * since pivot tuples only exist to represent how the key space is
@@ -252,6 +251,38 @@ typedef struct BTMetaPageData
  * omitted rather than truncated, since its representation is different to
  * the non-pivot representation.)
  *
+ * Non-pivot posting tuple format:
+ *  t_tid | t_info | key values | INCLUDE columns, if any | posting_list[]
+ *
+ * In order to store duplicate keys more efficiently, we use a special tuple
+ * format -- posting tuples.  posting_list is an array of ItemPointerData.
+ *
+ * Deduplication never applies to unique indexes or indexes with INCLUDEd
+ * columns.
+ *
+ * To distinguish posting tuples, we use the INDEX_ALT_TID_MASK flag in
+ * t_info and the BT_IS_POSTING flag in t_tid.
+ * These flags redefine the content of the posting tuple's t_tid:
+ * - t_tid.ip_blkid contains the offset of the posting list.
+ * - the t_tid offset field contains the number of posting items in the tuple.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items in posting tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.
+ * BT_N_POSTING_OFFSET_MASK is large enough to store any number of posting
+ * list items, since that number is constrained by BTMaxItemSize.
+ *
+ * If a page contains so many duplicates that they do not fit into one
+ * posting tuple (which is bounded by BTMaxItemSize), the page may contain
+ * several posting tuples with the same key.
+ * A page can also contain both posting and non-posting tuples with the same
+ * key.  Currently, posting tuples always contain at least two TIDs in the
+ * posting list.
+ *
+ * Posting tuples always have the same number of attributes as the index has
+ * generally.
+ *
  * Pivot tuple format:
  *
  *  t_tid | t_info | key values | [heap TID]
@@ -281,23 +312,118 @@ typedef struct BTMetaPageData
  * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for
  * future use.  BT_N_KEYS_OFFSET_MASK should be large enough to store any
  * number of columns/attributes <= INDEX_MAX_KEYS.
+ * The BT_IS_POSTING bit must be unset for pivot tuples, since we use it
+ * to distinguish posting tuples from pivot tuples.
  *
  * Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes).  They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
 */
 #define INDEX_ALT_TID_MASK	INDEX_AM_RESERVED_BIT
 
 /* Item pointer offset bits */
 #define BT_RESERVED_OFFSET_MASK	0xF000
 #define BT_N_KEYS_OFFSET_MASK	0x0FFF
+#define BT_N_POSTING_OFFSET_MASK 0x0FFF
 #define BT_HEAP_TID_ATTR	0x1000
+#define BT_IS_POSTING		0x2000
 
-/* Get/set downlink block number */
+/*
+ * MaxPostingIndexTuplesPerPage is an upper bound on the number of "logical"
+ * tuples (heap TIDs) that can fit on one btree leaf page.
+ *
+ * Btree leaf pages may contain posting tuples, which store duplicates
+ * more compactly, so MaxPostingIndexTuplesPerPage is larger than
+ * MaxIndexTuplesPerPage.
+ *
+ * Each leaf page must contain at least three items, so estimate the bound
+ * as if the page held just three posting tuples with minimal-size keys.
+ */
+#define MaxPostingIndexTuplesPerPage \
+	((int) ((BLCKSZ - SizeOfPageHeaderData - \
+			3*((MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))) )) / \
+			(sizeof(ItemPointerData)))
+
+/*
+ * Btree-private state needed to build posting tuples.
+ * ipd is the pending posting list -- an array of ItemPointerData.
+ *
+ * While iterating over tuples during an index build, or when applying
+ * deduplication to a single page, we remember a tuple in itupprev and
+ * compare the next tuple with it.  If the tuples are equal, their TIDs are
+ * saved in the pending posting list; ntuples is the number of saved TIDs.
+ *
+ * maxitemsize and maxpostingsize ensure that the resulting posting tuple
+ * will satisfy BTMaxItemSize.
+ */
+typedef struct BTDedupState
+{
+	Size		maxitemsize;
+	Size		maxpostingsize;
+	IndexTuple	itupprev;
+	int			ntuples;
+	ItemPointerData *ipd;
+} BTDedupState;
+
+/*
+ * N.B.: BTreeTupleIsPivot() should only be used in code that deals with
+ * heapkeyspace indexes specifically.  BTreeTupleIsPosting() works with all
+ * nbtree indexes, though.
+ */ +#define BTreeTupleIsPivot(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\ + ) +#define BTreeTupleIsPosting(itup) \ + ( \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\ + ) + +#define BTreeTupleClearBtIsPosting(itup) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \ + } while(0) + +#define BTreeTupleGetNPosting(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \ + ) +#define BTreeTupleSetNPosting(itup, n) \ + do { \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \ + Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \ + ItemPointerSetOffsetNumber(&(itup)->t_tid, \ + ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \ + } while(0) + +/* + * If tuple is posting, t_tid.ip_blkid contains offset of the posting list + */ +#define BTreeTupleGetPostingOffset(itup) \ + ( \ + AssertMacro(BTreeTupleIsPosting(itup)), \ + ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \ + ) +#define BTreeSetPostingMeta(itup, nposting, off) \ + do { \ + BTreeTupleSetNPosting(itup, nposting); \ + Assert(BTreeTupleIsPosting(itup)); \ + ItemPointerSetBlockNumber(&((itup)->t_tid), (off)); \ + } while(0) + +#define BTreeTupleGetPosting(itup) \ + (ItemPointer) ((char*) (itup) + BTreeTupleGetPostingOffset(itup)) +#define BTreeTupleGetPostingN(itup,n) \ + (BTreeTupleGetPosting(itup) + (n)) + +/* Get/set downlink block number */ #define BTreeInnerTupleGetDownLink(itup) \ ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) #define BTreeInnerTupleSetDownLink(itup, blkno) \ @@ -326,40 +452,73 @@ typedef struct BTMetaPageData */ #define BTreeTupleGetNAtts(itup, rel) \ ( \ - (itup)->t_info & INDEX_ALT_TID_MASK ? \ + ((itup)->t_info & INDEX_ALT_TID_MASK && \ + ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0)) ? \ ( \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \ ) \ : \ IndexRelationGetNumberOfAttributes(rel) \ ) -#define BTreeTupleSetNAtts(itup, n) \ - do { \ - (itup)->t_info |= INDEX_ALT_TID_MASK; \ - ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \ - } while(0) + +static inline void +BTreeTupleSetNAtts(IndexTuple itup, int n) +{ + Assert(!BTreeTupleIsPosting(itup)); + itup->t_info |= INDEX_ALT_TID_MASK; + ItemPointerSetOffsetNumber(&itup->t_tid, n & BT_N_KEYS_OFFSET_MASK); +} /* - * Get tiebreaker heap TID attribute, if any. Macro works with both pivot - * and non-pivot tuples, despite differences in how heap TID is represented. + * Get tiebreaker heap TID attribute, if any. Works with both pivot and + * non-pivot tuples, despite differences in how heap TID is represented. + * + * This returns the first/lowest heap TID in the case of a posting list tuple. */ -#define BTreeTupleGetHeapTID(itup) \ - ( \ - (itup)->t_info & INDEX_ALT_TID_MASK && \ - (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \ - ( \ - (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \ - sizeof(ItemPointerData)) \ - ) \ - : (itup)->t_info & INDEX_ALT_TID_MASK ? 
NULL : (ItemPointer) &((itup)->t_tid) \ - ) +static inline ItemPointer +BTreeTupleGetHeapTID(IndexTuple itup) +{ + if (BTreeTupleIsPivot(itup)) + { + /* Pivot tuple heap TID representation? */ + if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & + BT_HEAP_TID_ATTR) != 0) + return (ItemPointer) ((char *) itup + IndexTupleSize(itup) - + sizeof(ItemPointerData)); + + /* Heap TID attribute was truncated */ + return NULL; + } + else if (BTreeTupleIsPosting(itup)) + return BTreeTupleGetPosting(itup); + + return &(itup->t_tid); +} + +/* + * Get maximum heap TID attribute, which could be the only TID in the case of + * a non-pivot tuple that does not have a posting list tuple. Works with + * non-pivot tuples only. + */ +static inline ItemPointer +BTreeTupleGetMaxTID(IndexTuple itup) +{ + Assert(!BTreeTupleIsPivot(itup)); + + if (BTreeTupleIsPosting(itup)) + return (ItemPointer) (BTreeTupleGetPosting(itup) + + (BTreeTupleGetNPosting(itup) - 1)); + + return &(itup->t_tid); +} + /* * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK - * representation (currently limited to pivot tuples) + * representation */ #define BTreeTupleSetAltHeapTID(itup) \ do { \ - Assert((itup)->t_info & INDEX_ALT_TID_MASK); \ + Assert(BTreeTupleIsPivot(itup)); \ ItemPointerSetOffsetNumber(&(itup)->t_tid, \ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \ } while(0) @@ -499,6 +658,13 @@ typedef struct BTInsertStateData /* Buffer containing leaf page we're likely to insert itup on */ Buffer buf; + /* + * if _bt_binsrch_insert() found the location inside existing posting + * list, save the position inside the list. This will be -1 in rare cases + * where the overlapping posting list is LP_DEAD. + */ + int in_posting_offset; + /* * Cache of bounds within the current buffer. Only used for insertions * where _bt_check_unique is called. See _bt_binsrch_insert and @@ -534,7 +700,9 @@ typedef BTInsertStateData *BTInsertState; * If we are doing an index-only scan, we save the entire IndexTuple for each * matched item, otherwise only its heap TID and offset. The IndexTuples go * into a separate workspace array; each BTScanPosItem stores its tuple's - * offset within that array. + * offset within that array. Posting list tuples store a version of the + * tuple that does not include the posting list, allowing the same key to be + * returned for each logical tuple associated with the posting list. */ typedef struct BTScanPosItem /* what we remember about each match */ @@ -563,9 +731,13 @@ typedef struct BTScanPosData /* * If we are doing an index-only scan, nextTupleOffset is the first free - * location in the associated tuple storage workspace. + * location in the associated tuple storage workspace. Posting list + * tuples need postingTupleOffset to store the current location of the + * tuple that is returned multiple times (once per heap TID in posting + * list). 
 */
 	int			nextTupleOffset;
+	int			postingTupleOffset;
 
 	/*
 	 * The items array is always ordered in index order (ie, increasing
@@ -578,7 +750,7 @@ typedef struct BTScanPosData
 	int			lastItem;		/* last valid index in items[] */
 	int			itemIndex;		/* current index in items[] */
 
-	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+	BTScanPosItem items[MaxPostingIndexTuplesPerPage];	/* MUST BE LAST */
 } BTScanPosData;
 
 typedef BTScanPosData *BTScanPos;
@@ -762,6 +934,8 @@ extern void _bt_delitems_delete(Relation rel, Buffer buf,
 								OffsetNumber *itemnos, int nitems, Relation heapRel);
 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 								OffsetNumber *itemnos, int nitems,
+								OffsetNumber *remainingoffset,
+								IndexTuple *remaining, int nremaining,
 								BlockNumber lastBlockVacuumed);
 extern int	_bt_pagedel(Relation rel, Buffer buf);
 
@@ -812,6 +986,9 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
 							OffsetNumber offnum);
 extern void _bt_check_third_page(Relation rel, Relation heap,
 								 bool needheaptidspace, Page page, IndexTuple newtup);
+extern IndexTuple BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd,
+										int nipd);
+extern IndexTuple BTreeGetNthTupleOfPosting(IndexTuple tuple, int n);
 
 /*
  * prototypes for functions in nbtvalidate.c
@@ -824,5 +1001,6 @@ extern bool btvalidate(Oid opclassoid);
 extern IndexBuildResult *btbuild(Relation heap, Relation index,
 								 struct IndexInfo *indexInfo);
 extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
+extern void _bt_stash_item_tid(BTDedupState *dedupState, IndexTuple itup);
 
 #endif							/* NBTREE_H */
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index afa614da25..daa931377f 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -61,16 +61,26 @@ typedef struct xl_btree_metadata
  * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
  * Note that INSERT_META implies it's not a leaf page.
  *
- * Backup Blk 0: original page (data contains the inserted tuple)
+ * Backup Blk 0: original page (data contains the inserted tuple);
+ *               if postingsz is not 0, the data also contains 'nposting',
+ *               the replacement tuple for the original posting tuple.
+ *
+ * TODO: it may be enough to store just a flag indicating that the data
+ * contains 'nposting' and compute its offset from the inserted tuple's
+ * size, but it is not clear that the resulting offset would satisfy
+ * alignment requirements.
+ *
  * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
  * Backup Blk 2: xl_btree_metadata, if INSERT_META
+ *
  */
 typedef struct xl_btree_insert
 {
 	OffsetNumber offnum;
+	uint32		postingsz;
 } xl_btree_insert;
 
-#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
+#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, postingsz) + sizeof(uint32))
 
 /*
  * On insert with split, we save all the items going into the right sibling
@@ -96,6 +106,12 @@ typedef struct xl_btree_insert
  * An IndexTuple representing the high key of the left page must follow with
  * either variant.
  *
+ * If the split required replacing a posting tuple (the new item was inserted
+ * into the middle of its posting list), the data also contains 'nposting',
+ * which must replace the original posting tuple at offset replacepostingoff.
+ * TODO: a further optimization is to include 'nposting' in the WAL record
+ * only when it remains on the left page.
+ * * Backup Blk 1: new right page * * The right page's data portion contains the right page's tuples in the form @@ -113,9 +129,10 @@ typedef struct xl_btree_split uint32 level; /* tree level of page being split */ OffsetNumber firstright; /* first item moved to right page */ OffsetNumber newitemoff; /* new item's offset (if placed on left page) */ + OffsetNumber replacepostingoff; /* offset of the posting item to replace */ } xl_btree_split; -#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit (offsetof(xl_btree_split, replacepostingoff) + sizeof(OffsetNumber)) /* * This is what we need to know about delete of individual leaf index tuples. @@ -173,10 +190,19 @@ typedef struct xl_btree_vacuum { BlockNumber lastBlockVacuumed; - /* TARGET OFFSET NUMBERS FOLLOW */ + /* + * This field helps us to find beginning of the remaining tuples from + * postings which follow array of offset numbers. + */ + uint32 nremaining; + uint32 ndeleted; + + /* REMAINING OFFSET NUMBERS FOLLOW (nremaining values) */ + /* REMAINING TUPLES TO INSERT FOLLOW (if nremaining > 0) */ + /* TARGET OFFSET NUMBERS FOLLOW (if any) */ } xl_btree_vacuum; -#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber)) +#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(BlockNumber)) /* * This is what we need to know about marking an empty branch for deletion. diff --git a/src/tools/valgrind.supp b/src/tools/valgrind.supp index ec47a228ae..71a03e39d3 100644 --- a/src/tools/valgrind.supp +++ b/src/tools/valgrind.supp @@ -212,3 +212,24 @@ Memcheck:Cond fun:PyObject_Realloc } + +# Temporarily work around bug in datum_image_eq's handling of the cstring +# (typLen == -2) case. datumIsEqual() is not affected, but also doesn't handle +# TOAST'ed values correctly. +# +# FIXME: Remove both suppressions when bug is fixed on master branch +{ + temporary_workaround_1 + Memcheck:Addr1 + fun:bcmp + fun:datum_image_eq + fun:_bt_keep_natts_fast +} + +{ + temporary_workaround_8 + Memcheck:Addr8 + fun:bcmp + fun:datum_image_eq + fun:_bt_keep_natts_fast +} -- 2.17.1
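For reference, a condensed sketch of how btree_xlog_vacuum() earlier in this patch walks the payload that follows xl_btree_vacuum. The sketch is illustrative only and not part of the patch: the function name and the deletedoffset local are invented, while the xl_btree_vacuum field names and the page calls follow the patch.

/*
 * Sketch only: parsing of the xl_btree_vacuum payload during redo,
 * mirroring btree_xlog_vacuum().
 */
static void
btree_vacuum_payload_sketch(xl_btree_vacuum *xlrec, char *ptr, Page page)
{
	/* offsets of fully deleted items come first in the payload */
	OffsetNumber *deletedoffset = (OffsetNumber *) ptr;
	/* then offsets of posting tuples that shrank but survive */
	OffsetNumber *remainingoffset = (OffsetNumber *)
	(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
	/* then the replacement ("remaining") tuples themselves */
	IndexTuple	remaining = (IndexTuple)
	((char *) remainingoffset + xlrec->nremaining * sizeof(OffsetNumber));

	for (int i = 0; i < xlrec->nremaining; i++)
	{
		Size		itemsz = MAXALIGN(IndexTupleSize(remaining));

		/* replace the old posting tuple with its trimmed-down version */
		PageIndexTupleDelete(page, remainingoffset[i]);
		if (PageAddItem(page, (Item) remaining, itemsz, remainingoffset[i],
						false, false) == InvalidOffsetNumber)
			elog(PANIC, "failed to re-add remaining item");

		remaining = (IndexTuple) ((char *) remaining + itemsz);
	}

	if (xlrec->ndeleted > 0)
		PageIndexMultiDelete(page, deletedoffset, xlrec->ndeleted);
}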