From 98197e834343b804308f681b7110444499c79eed Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 1 Oct 2018 16:48:08 -0700
Subject: [PATCH v8 4/6] Add split-at-new-tuple page split optimization.

Add additional heuristics to the algorithm for locating an optimal split
location.  New logic identifies localized monotonically increasing values by
recognizing adjacent heap TIDs.  Only non-rightmost pages are affected, to
preserve existing behavior.  This enhancement is new to version 6 of the
patch series.

This enhancement has been demonstrated to be very effective at avoiding
index bloat when initial bulk INSERTs for the TPC-C benchmark are run.
Evidently, the primary keys for all of the largest indexes in the TPC-C
schema are populated through localized, monotonically increasing values:

Master
======

order_line_pkey: 774 MB
stock_pkey: 181 MB
idx_customer_name: 107 MB
oorder_pkey: 78 MB
customer_pkey: 75 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 60 MB
new_order_pkey: 22 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Patch series, up to and including this commit
=============================================

order_line_pkey: 451 MB
stock_pkey: 114 MB
idx_customer_name: 105 MB
oorder_pkey: 45 MB
customer_pkey: 48 MB
oorder_o_w_id_o_d_id_o_c_id_o_id_key: 61 MB
new_order_pkey: 13 MB
item_pkey: 2216 kB
district_pkey: 40 kB
warehouse_pkey: 24 kB

Without this patch, but with all previous patches in the series, a much more
modest reduction in the volume of bloat occurs when the same test case is
run.  There is a reduction in the size of the largest index (the order line
primary key) of ~5% of its original size, whereas we see a reduction of ~42%
here.  (Note that the patch series generally has very little advantage over
master if the indexes are rebuilt via a REINDEX, with or without this later
commit.)

I (Peter Geoghegan) will provide reviewers with a convenient copy of this
test data if asked.  It comes from the oltpbench fair-use implementation of
TPC-C [1], but the same issue has independently been observed with the
BenchmarkSQL implementation of TPC-C [2].

Note that this commit also recognizes and prevents bloat with monotonically
*decreasing* tuple insertions (e.g., a single-DESC-attribute index on a date
column).  Affected cases will typically leave their index structure slightly
smaller than an equivalent monotonically increasing case would.
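As a quick illustration of what "adjacent heap TIDs" means here, the
following self-contained sketch models the rule that _bt_adjacenthtid()
applies in the patch below: the higher TID must point at the next line
pointer on the same heap block, or at the first line pointer on the very
next heap block.  The struct and function names are invented for this
illustration only, and it is not part of the patch; the real code works
with ItemPointerData and the BlockNumber/OffsetNumber accessors from the
PostgreSQL headers.

/*
 * Illustration only -- not part of the patch.  A deliberately simplified
 * stand-in for heap TIDs: (block, offset) pairs with 1-based offsets.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct DemoHeapTid
{
	uint32_t	block;		/* heap block number */
	uint16_t	offset;		/* line pointer offset within the block */
} DemoHeapTid;

/*
 * "Adjacent" means the high TID was almost certainly created immediately
 * after the low TID: either the next offset on the same block, or the
 * first offset of the very next block.  Note that the test is asymmetric.
 */
static bool
demo_adjacent_htid(DemoHeapTid low, DemoHeapTid high)
{
	if (low.block == high.block && low.offset + 1 == high.offset)
		return true;
	if (low.block + 1 == high.block && high.offset == 1)
		return true;
	return false;
}

int
main(void)
{
	DemoHeapTid a = {10, 7};
	DemoHeapTid b = {10, 8};
	DemoHeapTid c = {11, 1};

	printf("%d %d %d\n",
		   demo_adjacent_htid(a, b),	/* 1: next offset on same block */
		   demo_adjacent_htid(b, c),	/* 1: first offset of next block */
		   demo_adjacent_htid(b, a));	/* 0: adjacency is not commutative */
	return 0;
}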
[1] http://oltpbenchmark.com
[2] https://www.commandprompt.com/blog/postgres_autovacuum_bloat_tpc-c
---
 src/backend/access/nbtree/nbtinsert.c | 186 +++++++++++++++++++++++++-
 1 file changed, 184 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 0e37b8b23a..778805d6c1 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -100,6 +100,8 @@ static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
 static int	_bt_checksplitloc(FindSplitData *state,
 				   OffsetNumber firstoldonright, bool newitemonleft,
 				   int dataitemstoleft, Size firstoldonrightsz);
+static bool _bt_dosplitatnewitem(Relation rel, Page page,
+					 OffsetNumber newitemoff, IndexTuple newitem);
 static OffsetNumber _bt_bestsplitloc(Relation rel, Page page,
 				 FindSplitData *state, int perfectpenalty,
@@ -110,6 +112,7 @@ static int	_bt_perfect_penalty(Relation rel, Page page,
 					FindSplitData *state, SplitMode *secondmode);
 static int	_bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 				  IndexTuple newitem, SplitPoint *split, bool is_leaf);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
 			 OffsetNumber itup_off);
 static bool _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_scankey,
@@ -1745,7 +1748,13 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
  * etc) we will end up with a tree whose pages are about fillfactor% full,
  * instead of the 50% full result that we'd get without this special case.
  * This is the same as nbtsort.c produces for a newly-created tree.  Note
- * that leaf and nonleaf pages use different fillfactors.
+ * that leaf and nonleaf pages use different fillfactors.  Note also that
+ * the fillfactor% is determined dynamically when _bt_dosplitatnewitem()
+ * indicates that there are localized monotonically increasing insertions,
+ * or monotonically decreasing (DESC order) insertions.  (This can only
+ * happen with the default strategy, and should be thought of as a variant
+ * of the fillfactor% special case that is applied only when inserting into
+ * non-rightmost pages.)
  *
  * If called recursively in single value mode, we also try to arrange to
  * leave the left split page fillfactor% full, though we arrange to use a
@@ -1835,7 +1844,28 @@ _bt_findsplitloc(Relation rel,
 	state.is_weighted = P_RIGHTMOST(opaque);
 	if (state.is_leaf)
 	{
-		if (state.mode != SPLIT_SINGLE_VALUE)
+		/*
+		 * Consider split at new tuple optimization.  See
+		 * _bt_dosplitatnewitem() for an explanation.
+		 */
+		if (state.mode == SPLIT_DEFAULT && !P_RIGHTMOST(opaque) &&
+			_bt_dosplitatnewitem(rel, page, newitemoff, newitem))
+		{
+			/*
+			 * fillfactor% is dynamically set through interpolation of the
+			 * new/incoming tuple's offset position
+			 */
+			if (newitemoff > maxoff)
+				state.fillfactor = (double) BTREE_DEFAULT_FILLFACTOR / 100.0;
+			else if (newitemoff == P_FIRSTDATAKEY(opaque))
+				state.fillfactor = (double) BTREE_MIN_FILLFACTOR / 100.0;
+			else
+				state.fillfactor =
+					((double) newitemoff / (((double) maxoff + 1)));
+
+			state.is_weighted = true;
+		}
+		else if (state.mode != SPLIT_SINGLE_VALUE)
 		{
 			/* Only used on rightmost page */
 			state.fillfactor = RelationGetFillFactor(rel,
@@ -2174,6 +2204,126 @@ _bt_checksplitloc(FindSplitData *state,
 	return INT_MAX;
 }
 
+/*
+ * Subroutine to determine whether or not the page should be split at
+ * approximately the point that the new/incoming item would have been
+ * inserted.
+ *
+ * This routine infers two distinct cases in which splitting around the new
+ * item's insertion point is likely to lead to better space utilization over
+ * time:
+ *
+ * - Composite indexes that consist of one or more leading columns that
+ *   describe some grouping, plus a trailing, monotonically increasing
+ *   column.  If there happened to be only one grouping then the traditional
+ *   rightmost page split default fillfactor% would be used to good effect,
+ *   so it seems worth recognizing this case.  This usage pattern is
+ *   prevalent in the TPC-C benchmark, and is assumed to be common in real
+ *   world applications.
+ *
+ * - DESC-ordered insertions, including DESC-ordered single (non-heap-TID)
+ *   key attribute indexes.  We don't want the performance of explicitly
+ *   DESC-ordered indexes to be out of line with an equivalent ASC-ordered
+ *   index.  Also, there may be organic cases where items are continually
+ *   inserted in DESC order for an index with ASC sort order.
+ *
+ * Caller uses fillfactor% rather than using the new item offset directly
+ * because it allows suffix truncation to be applied using the usual
+ * criteria, which can still be helpful.  This approach is also more
+ * maintainable, since restrictions on split points can be handled in the
+ * usual way.
+ *
+ * Localized insert points are inferred here by observing that neighboring
+ * heap TIDs are "adjacent".  For example, if the new item has key attribute
+ * values distinct from those of the existing item immediately to its left,
+ * and the item to its left has a heap TID whose offset is exactly one less
+ * than the new item's offset, then caller is told to use its new-item-split
+ * strategy.  It isn't of much consequence if this routine incorrectly
+ * infers that an interesting case is taking place, provided that that
+ * doesn't happen very often.  In particular, it should not be possible to
+ * construct a test case where the routine consistently does the wrong
+ * thing.  Since heap TID "adjacency" is such a delicate condition, and
+ * since there is no reason to imagine that random insertions should ever
+ * consistently leave new tuples at the first or last position on the page
+ * when a split is triggered, that will never happen.
+ *
+ * Note that we avoid using the split-at-new fillfactor% when we'd have to
+ * append a heap TID during suffix truncation.  We also insist that there
+ * are no varwidth attributes or NULL attribute values in the new item,
+ * since that invalidates interpolating from the new item offset.  Besides,
+ * varwidths generally imply the use of datatypes where ordered insertions
+ * are not a naturally occurring phenomenon.
+ */
+static bool
+_bt_dosplitatnewitem(Relation rel, Page page, OffsetNumber newitemoff,
+					 IndexTuple newitem)
+{
+	ItemId		itemid;
+	OffsetNumber maxoff;
+	BTPageOpaque opaque;
+	IndexTuple	tup;
+	int16		nkeyatts;
+
+	if (IndexTupleHasNulls(newitem) || IndexTupleHasVarwidths(newitem))
+		return false;
+
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	/* Avoid optimization entirely on pages with large items */
+	if (maxoff <= 3)
+		return false;
+
+	nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+
+	/*
+	 * When heap TIDs appear in DESC order, consider left-heavy split.
+	 *
+	 * Accept left-heavy split when new item, which will be inserted at first
+	 * data offset, has adjacent TID to extant item at that position.
+	 */
+	if (newitemoff == P_FIRSTDATAKEY(opaque))
+	{
+		itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
+		tup = (IndexTuple) PageGetItem(page, itemid);
+
+		return _bt_adjacenthtid(&tup->t_tid, &newitem->t_tid) &&
+			_bt_leave_natts_fast(rel, tup, newitem) <= nkeyatts;
+	}
+
+	/* Single key indexes only use DESC optimization */
+	if (nkeyatts == 1)
+		return false;
+
+	/*
+	 * When tuple heap TIDs appear in ASC order, consider right-heavy split,
+	 * even though this may not be the right-most page.
+	 *
+	 * Accept right-heavy split when new item, which belongs after any
+	 * existing page offset, has adjacent TID to extant item that's the last
+	 * on the page.
+	 */
+	if (newitemoff > maxoff)
+	{
+		itemid = PageGetItemId(page, maxoff);
+		tup = (IndexTuple) PageGetItem(page, itemid);
+
+		return _bt_adjacenthtid(&tup->t_tid, &newitem->t_tid) &&
+			_bt_leave_natts_fast(rel, tup, newitem) <= nkeyatts;
+	}
+
+	/*
+	 * When new item is approximately in the middle of the page, look for
+	 * adjacency among new item, and extant item that belongs to the left of
+	 * the new item in the keyspace.
+	 */
+	itemid = PageGetItemId(page, OffsetNumberPrev(newitemoff));
+	tup = (IndexTuple) PageGetItem(page, itemid);
+
+	return _bt_adjacenthtid(&tup->t_tid, &newitem->t_tid) &&
+		_bt_leave_natts_fast(rel, tup, newitem) <= nkeyatts;
+}
+
 /*
  * Subroutine to find the "best" split point among an array of acceptable
  * candidate split points that split without there being an excessively high
@@ -2459,6 +2609,38 @@ _bt_split_penalty(Relation rel, Page page, OffsetNumber newitemoff,
 	return _bt_leave_natts_fast(rel, lastleft, firstright);
 }
 
+/*
+ * Subroutine for determining if two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted
+ * into the heap relation immediately after the low TID, probably by the
+ * same transaction, and probably not through heap_update().  This is not a
+ * commutative condition.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+	BlockNumber lowblk,
+				highblk;
+	OffsetNumber lowoff,
+				highoff;
+
+	lowblk = ItemPointerGetBlockNumber(lowhtid);
+	highblk = ItemPointerGetBlockNumber(highhtid);
+	lowoff = ItemPointerGetOffsetNumber(lowhtid);
+	highoff = ItemPointerGetOffsetNumber(highhtid);
+
+	/* When heap blocks match, second offset should be one up */
+	if (lowblk == highblk && OffsetNumberNext(lowoff) == highoff)
+		return true;
+
+	/* When heap block is one up, second offset should be FirstOffsetNumber */
+	if (lowblk + 1 == highblk && highoff == FirstOffsetNumber)
+		return true;
+
+	return false;
+}
+
 /*
  * _bt_insert_parent() -- Insert downlink into parent after a page split.
  *
-- 
2.17.1
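Reviewer's aside, not part of the patch: the sketch below shows the
arithmetic that the new _bt_findsplitloc() branch uses to derive a dynamic
leaf fillfactor from the new item's would-be offset.  All names and constants
here are stand-ins chosen for the demo (10 and 90 are assumed to match
BTREE_MIN_FILLFACTOR and BTREE_DEFAULT_FILLFACTOR); only the interpolation
itself mirrors the patch.

#include <stdio.h>

#define DEMO_MIN_FILLFACTOR		10	/* assumed stand-in for BTREE_MIN_FILLFACTOR */
#define DEMO_DEFAULT_FILLFACTOR	90	/* assumed stand-in for BTREE_DEFAULT_FILLFACTOR */

/*
 * Map the new item's would-be offset to a split fillfactor fraction: an
 * insert past the last item gets the default (right-heavy) fillfactor, an
 * insert at the first data offset gets the minimum (left-heavy) one, and
 * anything in between interpolates linearly on the offset position.
 */
static double
demo_split_fillfactor(int newitemoff, int maxoff, int firstdataoff)
{
	if (newitemoff > maxoff)
		return (double) DEMO_DEFAULT_FILLFACTOR / 100.0;
	if (newitemoff == firstdataoff)
		return (double) DEMO_MIN_FILLFACTOR / 100.0;
	return (double) newitemoff / ((double) maxoff + 1);
}

int
main(void)
{
	int		maxoff = 100;		/* items currently on the leaf page */
	int		firstdataoff = 2;	/* non-rightmost pages keep their high key at offset 1 */
	int		offs[] = {2, 25, 50, 75, 101};

	for (int i = 0; i < 5; i++)
		printf("newitemoff = %3d -> fillfactor = %.2f\n",
			   offs[i], demo_split_fillfactor(offs[i], maxoff, firstdataoff));
	return 0;
}

So an insertion point roughly three quarters of the way through the page
leaves roughly three quarters of the existing items on the left half, which
is the behaviour the TPC-C-style workloads described in the commit message
benefit from.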