From ed66dc19791033ee5c3a530c60e537d5f137699c Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 1 Oct 2018 16:48:08 -0700
Subject: [PATCH v12 4/7] Add "split after new tuple" optimization.

Add additional heuristics to the algorithm for locating an optimal
split location. New logic identifies groups of localized monotonically
increasing values. Splitting between two such groups can greatly
improve space utilization. Without this patch, affected cases reliably
leave leaf pages no more than about 50% full. 50/50 page splits are
only appropriate with a pattern of truly random insertions.

The optimization is very similar to the long-established fillfactor
optimization used during rightmost page splits, where we usually leave
the new left side of the split 90% full. Split-after-new-tuple page
splits target essentially the same case. The splits targeted are those
at the rightmost point of a localized grouping of values, rather than
those at the rightmost point of the entire key space.

This enhancement is very effective at avoiding index bloat during the
TPC-C benchmark's initial bulk INSERTs, and throughout the benchmark
run itself. Localized monotonically increasing insertion patterns are
presumed to be fairly common in real-world applications.

Note that even pre-pg_upgrade'd v3 indexes make use of this
optimization.
---
 src/backend/access/nbtree/nbtsplitloc.c | 174 ++++++++++++++++++++++++
 1 file changed, 174 insertions(+)

diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index 015591eb87..396737f4bd 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -61,6 +61,9 @@ static OffsetNumber _bt_dofindsplitloc(Relation rel, Page page,
 static int _bt_checksplitloc(FindSplitData *state,
                   OffsetNumber firstoldonright, bool newitemonleft,
                   int dataitemstoleft, Size firstoldonrightsz);
+static bool _bt_splitafternewitemoff(Relation rel, Page page,
+                  int leaffillfactor, OffsetNumber newitemoff,
+                  IndexTuple newitem, double *propfullonleft);
 static OffsetNumber _bt_bestsplitloc(Relation rel, Page page,
                     FindSplitData *state,
                     int perfectpenalty,
@@ -71,6 +74,7 @@ static int _bt_perfect_penalty(Relation rel, Page page, SplitMode mode,
                    IndexTuple newitem, SplitMode *secondmode);
 static int _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
                   IndexTuple newitem, SplitPoint *split, bool is_leaf);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 
 
 /*
@@ -254,6 +258,12 @@ _bt_dofindsplitloc(Relation rel,
         state.propfullonleft = leaffillfactor / 100.0;
         state.is_weighted = true;
     }
+    else if (_bt_splitafternewitemoff(rel, page, leaffillfactor, newitemoff,
+                                      newitem, &state.propfullonleft))
+    {
+        /* propfullonleft was set for us */
+        state.is_weighted = true;
+    }
     else
     {
         /* propfullonleft won't be used, but be tidy */
@@ -555,6 +565,142 @@ _bt_checksplitloc(FindSplitData *state,
     return INT_MAX;
 }
 
+/*
+ * Subroutine to determine whether or not the page should be split immediately
+ * after the would-be original page offset for the new/incoming tuple. When
+ * the optimization is applied, the new/incoming tuple becomes the last tuple
+ * on the new left page. (Actually, newitemoff > maxoff is usually the case
+ * that leads to applying the optimization in practice, so applying leaf
+ * fillfactor in the style of a rightmost leaf page split is the most common
+ * outcome.)
+ *
+ * This routine targets splits in composite indexes that consist of one or
+ * more leading columns that describe some grouping, plus a trailing column
+ * with ascending values. This pattern is prevalent in many real world
+ * applications. Consider the example of a composite index on (supplier_id,
+ * invoice_id), where there are a small, nearly-fixed number of suppliers, and
+ * invoice_id is an identifier assigned in ascending order (it doesn't matter
+ * whether or not suppliers are assigned invoice_id values from the same
+ * counter, or their own counter). Without this optimization, approximately
+ * 50% of space in leaf pages will be wasted by unweighted/50:50 page splits.
+ * With this optimization, space utilization will be close to that of a
+ * similar index where all tuple insertions modify the current rightmost leaf
+ * page in the index.
+ *
+ * This optimization may leave extra free space remaining on the rightmost
+ * page of a "most significant column" grouping of tuples if that grouping
+ * never ends up having future insertions that use the free space. Testing
+ * has shown the effect to be self-limiting; a future grouping that becomes
+ * the "nearest on the right" grouping of the affected grouping usually puts
+ * the extra free space to good use instead.
+ *
+ * Caller uses propfullonleft rather than using the new item offset directly
+ * because not all offsets will be deemed legal as split points. This also
+ * allows us to apply leaf fillfactor in the common case where the new
+ * insertion is after the last offset.
+ */
+static bool
+_bt_splitafternewitemoff(Relation rel, Page page, int leaffillfactor,
+                         OffsetNumber newitemoff, IndexTuple newitem,
+                         double *propfullonleft)
+{
+    OffsetNumber maxoff;
+    int16       nkeyatts;
+    ItemId      itemid;
+    IndexTuple  tup;
+    Size        tupspace;
+    Size        hikeysize;
+    int         keepnatts;
+
+    Assert(!P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page)));
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    /* Proceed only when items on page look fairly short */
+    if (maxoff < MaxIndexTuplesPerPage / 2)
+        return false;
+
+    nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+    /* Single key indexes not considered here */
+    if (nkeyatts == 1)
+        return false;
+
+    /* Ascending insertion pattern never inferred when new item is first */
+    if (newitemoff == P_FIRSTKEY)
+        return false;
+
+    /*
+     * Avoid applying optimization when tuples are not all of uniform size,
+     * with the exception of the high key (existing high key may be smaller
+     * due to truncation). Surmise that page has equisized tuples when page
+     * layout is consistent with having maxoff-1 non-pivot tuples that are all
+     * the same size as the newly inserted tuple.
+     */
+    tupspace = ((PageHeader) page)->pd_special - ((PageHeader) page)->pd_upper;
+    itemid = PageGetItemId(page, P_HIKEY);
+    hikeysize = ItemIdGetLength(itemid);
+    if (IndexTupleSize(newitem) * (maxoff - 1) != tupspace - hikeysize)
+        return false;
+
+    /*
+     * At least the first attribute's value must be equal to the corresponding
+     * value in the previous tuple to apply optimization. New item cannot be a
+     * duplicate, either.
+     */
+    if (newitemoff > maxoff)
+    {
+        /* Try to infer ascending insertion pattern */
+        itemid = PageGetItemId(page, maxoff);
+        tup = (IndexTuple) PageGetItem(page, itemid);
+        keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+        if (keepnatts > 1 && keepnatts <= nkeyatts)
+        {
+            *propfullonleft = (double) leaffillfactor / 100.0;
+            return true;
+        }
+
+        return false;
+    }
+
+    /*
+     * When item isn't last (or first) on page, try to infer ascending
+     * insertion pattern. We try to split at the precise point of the
+     * insertion here, rather than applying leaf fillfactor.
+     *
+     * "Low cardinality leading column, high cardinality suffix column"
+     * indexes with a random insertion pattern (e.g. an index on '(country_id,
+     * event_uuid)') may sometimes end up having the optimization applied
+     * instead of getting a 50:50 (unweighted) page split. This is
+     * suboptimal.
+     *
+     * We're willing to accept that outcome when an incoming/new tuple is to
+     * the right of all existing items on the page, since that's expected for
+     * about one half of 1% of all page splits that occur in the index's
+     * lifetime (assuming default BLCKSZ) with random insertions. More care
+     * must be taken here, where we consider splits involving the new item
+     * being inserted at neither edge of the page: we proceed only when new
+     * item's heap TID is "adjacent" to the heap TID of the existing tuple to
+     * the immediate left of the offset for the new item. Heap TID adjacency
+     * strongly suggests that the item just to the left was inserted very
+     * recently.
+     */
+    itemid = PageGetItemId(page, OffsetNumberPrev(newitemoff));
+    tup = (IndexTuple) PageGetItem(page, itemid);
+    if (!_bt_adjacenthtid(&tup->t_tid, &newitem->t_tid))
+        return false;
+    /* Check same conditions as rightmost item case, too */
+    keepnatts = _bt_keep_natts_fast(rel, tup, newitem);
+
+    if (keepnatts > 1 && keepnatts <= nkeyatts)
+    {
+        *propfullonleft = (double) newitemoff / (((double) maxoff + 1));
+        return true;
+    }
+
+    return false;
+}
+
 /*
  * Subroutine to find the "best" split point among an array of acceptable
  * candidate split points that split without there being an excessively high
@@ -853,3 +999,31 @@ _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
     Assert(lastleft != firstright);
     return _bt_keep_natts_fast(rel, lastleft, firstright);
 }
+
+/*
+ * Subroutine for determining if two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted into
+ * the heap relation immediately after the low TID, probably by the same
+ * transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+    BlockNumber lowblk,
+                highblk;
+
+    lowblk = ItemPointerGetBlockNumber(lowhtid);
+    highblk = ItemPointerGetBlockNumber(highhtid);
+
+    /* Make optimistic assumption of adjacency when heap blocks match */
+    if (lowblk == highblk)
+        return true;
+
+    /* When heap block one up, second offset should be FirstOffsetNumber */
+    if (lowblk + 1 == highblk &&
+        ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+        return true;
+
+    return false;
+}
-- 
2.17.1
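
Reviewer note, not part of the patch: below is a minimal, self-contained
sketch of the two heuristics the patch adds, heap TID "adjacency" and the
propfullonleft computation. The type and function names here (HeapTid,
adjacent_htid, prop_full_on_left) are simplified stand-ins invented for
illustration; they are not the nbtree or heap structures the patch uses.
Only the decision logic mirrors _bt_adjacenthtid() and
_bt_splitafternewitemoff().

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for BlockNumber/OffsetNumber/ItemPointerData */
typedef unsigned int BlockNum;
typedef unsigned short OffsetNum;

typedef struct
{
    BlockNum    block;
    OffsetNum   offset;
} HeapTid;

#define FIRST_OFFSET 1

/*
 * Mirrors _bt_adjacenthtid(): TIDs are "adjacent" when they share a heap
 * block, or when the high TID is the first slot of the very next block.
 */
static bool
adjacent_htid(const HeapTid *low, const HeapTid *high)
{
    if (low->block == high->block)
        return true;
    if (low->block + 1 == high->block && high->offset == FIRST_OFFSET)
        return true;
    return false;
}

/*
 * Mirrors the two propfullonleft outcomes in _bt_splitafternewitemoff():
 * apply leaf fillfactor when the new item goes after the last existing
 * item, otherwise split at the insertion point itself.
 */
static double
prop_full_on_left(int newitemoff, int maxoff, int leaffillfactor)
{
    if (newitemoff > maxoff)
        return (double) leaffillfactor / 100.0;
    return (double) newitemoff / ((double) maxoff + 1);
}

int
main(void)
{
    HeapTid     low = {41, 226};
    HeapTid     high = {42, FIRST_OFFSET};

    /* Next heap block, first slot: treated as adjacent (prints 1) */
    printf("adjacent: %d\n", adjacent_htid(&low, &high));
    /* New item past the last offset: leaf fillfactor applies (0.90) */
    printf("rightmost-in-group: %.2f\n", prop_full_on_left(368, 367, 90));
    /* New item in the interior: split at the insertion point (~0.54) */
    printf("interior: %.2f\n", prop_full_on_left(200, 367, 90));
    return 0;
}

With the default leaf fillfactor of 90, the rightmost-in-group case leaves
the new left page 90% full, while the interior case splits at the exact
point of the insertion.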