From 98197e834343b804308f681b7110444499c79eed Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 1 Oct 2018 16:48:08 -0700
Subject: [PATCH v8 4/6] Add split-at-new-tuple page split optimization.

Add additional heuristics to the algorithm for locating an optimal split
location.  New logic identifies localized monotonically increasing values by
recognizing adjacent heap TIDs.  Only non-rightmost pages are affected, to
preserve existing behavior.  This enhancement is new to version 6 of the
patch series.

This enhancement has been demonstrated to be very effective at avoiding
index bloat when initial bulk INSERTs for the TPC-C benchmark are run.
Evidently, the primary keys for all of the largest indexes in the TPC-C
schema are populated through localized, monotonically increasing values:

Master
======

order_line_pkey: 774 MB
stock_pkey: 181 MB
idx_customer_name: 107 MB
oorder_pkey: 78 MB
customer_pkey: 75 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 60 MB
new_order_pkey: 22 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Patch series, up to and including this commit
=============================================

order_line_pkey: 451 MB
stock_pkey: 114 MB
idx_customer_name: 105 MB
oorder_pkey: 45 MB
customer_pkey: 48 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 61 MB
new_order_pkey: 13 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Without this patch, but with all previous patches in the series, a much more
modest reduction in the volume of bloat occurs when the same test case is
run.  There is a reduction in the size of the largest index (the order line
primary key) of ~5% of its original size, whereas we see a reduction of ~42%
here.  (Note that the patch series generally has very little advantage over
master if the indexes are rebuilt via a REINDEX, with or without this later
commit.)

I (Peter Geoghegan) will provide reviewers with a convenient copy of this
test data if asked.  It comes from the oltpbench fair-use implementation of
TPC-C [1], but the same issue has independently been observed with the
BenchmarkSQL implementation of TPC-C [2].

Note that this commit also recognizes and prevents bloat with monotonically
*decreasing* tuple insertions (e.g., a single-DESC-attribute index on a date
column).  Affected cases will typically leave their index structure slightly
smaller than an equivalent monotonically increasing case would.
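As a quick illustration of what "adjacent heap TIDs" means here, the
following self-contained sketch models the rule that _bt_adjacenthtid()
applies in the patch below: the higher TID must point at the next line
pointer on the same heap block, or at the first line pointer on the very
next heap block.  The struct and function names are invented for this
illustration only, and it is not part of the patch; the real code works
with ItemPointerData and the BlockNumber/OffsetNumber accessors from the
PostgreSQL headers.

/*
 * Illustration only -- not part of the patch.  A deliberately simplified
 * stand-in for heap TIDs: (block, offset) pairs with 1-based offsets.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct DemoHeapTid
{
	uint32_t	block;		/* heap block number */
	uint16_t	offset;		/* line pointer offset within the block */
} DemoHeapTid;

/*
 * "Adjacent" means the high TID was almost certainly created immediately
 * after the low TID: either the next offset on the same block, or the
 * first offset of the very next block.  Note that the test is asymmetric.
 */
static bool
demo_adjacent_htid(DemoHeapTid low, DemoHeapTid high)
{
	if (low.block == high.block && low.offset + 1 == high.offset)
		return true;
	if (low.block + 1 == high.block && high.offset == 1)
		return true;
	return false;
}

int
main(void)
{
	DemoHeapTid a = {10, 7};
	DemoHeapTid b = {10, 8};
	DemoHeapTid c = {11, 1};

	printf("%d %d %d\n",
		   demo_adjacent_htid(a, b),	/* 1: next offset on same block */
		   demo_adjacent_htid(b, c),	/* 1: first offset of next block */
		   demo_adjacent_htid(b, a));	/* 0: adjacency is not commutative */
	return 0;
}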
[1] http://oltpbenchmark.com
[2] https://www.commandprompt.com/blog/postgres_autovacuum_bloat_tpc-c
---
 src/backend/access/nbtree/nbtinsert.c | 186 +++++++++++++++++++++++++-
 1 file changed, 184 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 0e37b8b23a..778805d6c1 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -100,6 +100,8 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
 static int	_bt_checksplitloc(FindSplitData *state,
 				   OffsetNumber firstoldonright, bool newitemonleft,
 				   int dataitemstoleft, Size firstoldonrightsz);
+static bool _bt_dosplitatnewitem(Relation rel, Page page,
+					 OffsetNumber newitemoff, IndexTuple newitem);
 static OffsetNumber _bt_bestsplitloc(Relation rel, Page page,
 				 FindSplitData *state, int perfectpenalty,
@@ -110,6 +112,7 @@ static int	_bt_perfect_penalty(Relation rel, Page page,
 					FindSplitData *state, SplitMode *secondmode);
 static int	_bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 				  IndexTuple newitem, SplitPoint *split, bool is_leaf);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
 			 OffsetNumber itup_off);
 static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_scankey,
@@ -1745,7 +1748,13 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
  * etc) we will end up with a tree whose pages are about fillfactor% full,
  * instead of the 50% full result that we'd get without this special case.
  * This is the same as nbtsort.c produces for a newly-created tree.  Note
- * that leaf and nonleaf pages use different fillfactors.
+ * that leaf and nonleaf pages use different fillfactors.  Note also that
+ * the fillfactor% is determined dynamically when _bt_dosplitatnewitem()
+ * indicates that there are localized monotonically increasing insertions,
+ * or monotonically decreasing (DESC order) insertions.  (This can only
+ * happen with the default strategy, and should be thought of as a variant
+ * of the fillfactor% special case that is applied only when inserting into
+ * non-rightmost pages.)
  *
  * If called recursively in single value mode, we also try to arrange to
  * leave the left split page fillfactor% full, though we arrange to use a
@@ -1835,7 +1844,28 @@ _bt_findsplitloc(Relation rel,
 	state.is_weighted = P_RIGHTMOST(opaque);
 	if (state.is_leaf)
 	{
-		if (state.mode != SPLIT_SINGLE_VALUE)
+		/*
+		 * Consider split at new tuple optimization.  See
+		 * _bt_dosplitatnewitem() for an explanation.
+		 */
+		if (state.mode == SPLIT_DEFAULT && !P_RIGHTMOST(opaque) &&
+			_bt_dosplitatnewitem(rel, page, newitemoff, newitem))
+		{
+			/*
+			 * fillfactor% is dynamically set through interpolation of the
+			 * new/incoming tuple's offset position
+			 */
+			if (newitemoff > maxoff)
+				state.fillfactor = (double) BTREE_DEFAULT_FILLFACTOR / 100.0;
+			else if (newitemoff == P_FIRSTDATAKEY(opaque))
+				state.fillfactor = (double) BTREE_MIN_FILLFACTOR / 100.0;
+			else
+				state.fillfactor =
+					((double) newitemoff / (((double) maxoff + 1)));
+
+			state.is_weighted = true;
+		}
+		else if (state.mode != SPLIT_SINGLE_VALUE)
 		{
 			/* Only used on rightmost page */
 			state.fillfactor = RelationGetFillFactor(rel,
@@ -2174,6 +2204,126 @@ _bt_checksplitloc(FindSplitData *state,
 	return INT_MAX;
 }
 
+/*
+ * Subroutine to determine whether or not the page should be split at
+ * approximately the point that the new/incoming item would have been
+ * inserted.
+ *
+ * This routine infers two distinct cases in which splitting around the new
+ * item's insertion point is likely to lead to better space utilization over
+ * time:
+ *
+ * - Composite indexes that consist of one or more leading columns that
+ *   describe some grouping, plus a trailing, monotonically increasing
+ *   column.  If there happened to be only one grouping then the traditional
+ *   rightmost page split default fillfactor% would be used to good effect,
+ *   so it seems worth recognizing this case.  This usage pattern is
+ *   prevalent in the TPC-C benchmark, and is assumed to be common in real
+ *   world applications.
+ *
+ * - DESC-ordered insertions, including DESC-ordered single (non-heap-TID)
+ *   key attribute indexes.  We don't want the performance of explicitly
+ *   DESC-ordered indexes to be out of line with an equivalent ASC-ordered
+ *   index.  Also, there may be organic cases where items are continually
+ *   inserted in DESC order for an index with ASC sort order.
+ *
+ * Caller uses fillfactor% rather than using the new item offset directly
+ * because it allows suffix truncation to be applied using the usual
+ * criteria, which can still be helpful.  This approach is also more
+ * maintainable, since restrictions on split points can be handled in the
+ * usual way.
+ *
+ * Localized insert points are inferred here by observing that neighboring
+ * heap TIDs are "adjacent".  For example, if the new item has key attribute
+ * values distinct from those of the existing item immediately to its left,
+ * and the item to its left has a heap TID whose offset is exactly one less
+ * than the new item's offset, then caller is told to use its new-item-split
+ * strategy.  It isn't of much consequence if this routine incorrectly
+ * infers that an interesting case is taking place, provided that that
+ * doesn't happen very often.  In particular, it should not be possible to
+ * construct a test case where the routine consistently does the wrong
+ * thing.  Since heap TID "adjacency" is such a delicate condition, and
+ * since there is no reason to imagine that random insertions should ever
+ * consistently leave new tuples at the first or last position on the page
+ * when a split is triggered, that will never happen.
+ *
+ * Note that we avoid using the split-at-new fillfactor% when we'd have to
+ * append a heap TID during suffix truncation.  We also insist that there
+ * are no varwidth attributes or NULL attribute values in the new item,
+ * since that invalidates interpolating from the new item offset.  Besides,
+ * varwidths generally imply the use of datatypes where ordered insertions
+ * are not a naturally occurring phenomenon.
+ */
+static bool
+_bt_dosplitatnewitem(Relation rel, Page page, OffsetNumber newitemoff,
+					 IndexTuple newitem)
+{
+	ItemId		itemid;
+	OffsetNumber maxoff;
+	BTPageOpaque opaque;
+	IndexTuple	tup;
+	int16		nkeyatts;
+
+	if (IndexTupleHasNulls(newitem) || IndexTupleHasVarwidths(newitem))
+		return false;
+
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	/* Avoid optimization entirely on pages with large items */
+	if (maxoff <= 3)
+		return false;
+
+	nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+	/*
+	 * When heap TIDs appear in DESC order, consider left-heavy split.
+	 *
+	 * Accept left-heavy split when new item, which will be inserted at first
+	 * data offset, has adjacent TID to extant item at that position.
+	 */
+	if (newitemoff == P_FIRSTDATAKEY(opaque))
+	{
+		itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
+		tup = (IndexTuple) PageGetItem(page, itemid);
+
+		return _bt_adjacenthtid(&tup->t_tid, &newitem->t_tid) &&
+			_bt_leave_natts_fast(rel, tup, newitem) <= nkeyatts;
+	}
+
+	/* Single key indexes only use DESC optimization */
+	if (nkeyatts == 1)
+		return false;
+
+	/*
+	 * When tuple heap TIDs appear in ASC order, consider right-heavy split,
+	 * even though this may not be the right-most page.
+	 *
+	 * Accept right-heavy split when new item, which belongs after any
+	 * existing page offset, has adjacent TID to extant item that's the last
+	 * on the page.
+	 */
+	if (newitemoff > maxoff)
+	{
+		itemid = PageGetItemId(page, maxoff);
+		tup = (IndexTuple) PageGetItem(page, itemid);
+
+		return _bt_adjacenthtid(&tup->t_tid, &newitem->t_tid) &&
+			_bt_leave_natts_fast(rel, tup, newitem) <= nkeyatts;
+	}
+
+	/*
+	 * When new item is approximately in the middle of the page, look for
+	 * adjacency among new item, and extant item that belongs to the left of
+	 * the new item in the keyspace.
+	 */
+	itemid = PageGetItemId(page, OffsetNumberPrev(newitemoff));
+	tup = (IndexTuple) PageGetItem(page, itemid);
+
+	return _bt_adjacenthtid(&tup->t_tid, &newitem->t_tid) &&
+		_bt_leave_natts_fast(rel, tup, newitem) <= nkeyatts;
+}
+
 /*
  * Subroutine to find the "best" split point among an array of acceptable
  * candidate split points that split without there being an excessively high
@@ -2459,6 +2609,38 @@ _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 	return _bt_leave_natts_fast(rel, lastleft, firstright);
 }
 
+/*
+ * Subroutine for determining if two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted
+ * into the heap relation immediately after the low TID, probably by the
+ * same transaction, and probably not through heap_update().  This is not a
+ * commutative condition.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+	BlockNumber lowblk,
+				highblk;
+	OffsetNumber lowoff,
+				highoff;
+
+	lowblk = ItemPointerGetBlockNumber(lowhtid);
+	highblk = ItemPointerGetBlockNumber(highhtid);
+	lowoff = ItemPointerGetOffsetNumber(lowhtid);
+	highoff = ItemPointerGetOffsetNumber(highhtid);
+
+	/* When heap blocks match, second offset should be one up */
+	if (lowblk == highblk && OffsetNumberNext(lowoff) == highoff)
+		return true;
+
+	/* When heap block is one up, second offset should be FirstOffsetNumber */
+	if (lowblk + 1 == highblk && highoff == FirstOffsetNumber)
+		return true;
+
+	return false;
+}
+
 /*
  * _bt_insert_parent() -- Insert downlink into parent after a page split.
  *
-- 
2.17.1
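Reviewer's aside, not part of the patch: the sketch below shows the
arithmetic that the new _bt_findsplitloc() branch uses to derive a dynamic
leaf fillfactor from the new item's would-be offset.  All names and constants
here are stand-ins chosen for the demo (10 and 90 are assumed to match
BTREE_MIN_FILLFACTOR and BTREE_DEFAULT_FILLFACTOR); only the interpolation
itself mirrors the patch.

#include <stdio.h>

#define DEMO_MIN_FILLFACTOR		10	/* assumed stand-in for BTREE_MIN_FILLFACTOR */
#define DEMO_DEFAULT_FILLFACTOR	90	/* assumed stand-in for BTREE_DEFAULT_FILLFACTOR */

/*
 * Map the new item's would-be offset to a split fillfactor fraction: an
 * insert past the last item gets the default (right-heavy) fillfactor, an
 * insert at the first data offset gets the minimum (left-heavy) one, and
 * anything in between interpolates linearly on the offset position.
 */
static double
demo_split_fillfactor(int newitemoff, int maxoff, int firstdataoff)
{
	if (newitemoff > maxoff)
		return (double) DEMO_DEFAULT_FILLFACTOR / 100.0;
	if (newitemoff == firstdataoff)
		return (double) DEMO_MIN_FILLFACTOR / 100.0;
	return (double) newitemoff / ((double) maxoff + 1);
}

int
main(void)
{
	int		maxoff = 100;		/* items currently on the leaf page */
	int		firstdataoff = 2;	/* non-rightmost pages keep their high key at offset 1 */
	int		offs[] = {2, 25, 50, 75, 101};

	for (int i = 0; i < 5; i++)
		printf("newitemoff = %3d -> fillfactor = %.2f\n",
			   offs[i], demo_split_fillfactor(offs[i], maxoff, firstdataoff));
	return 0;
}

So an insertion point roughly three quarters of the way through the page
leaves roughly three quarters of the existing items on the left half, which
is the behaviour the TPC-C-style workloads described in the commit message
benefit from.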