From 9cb5f0d9f275ddd41b746f7636eb357d31095df4 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Mon, 1 Oct 2018 16:48:08 -0700
Subject: [PATCH v14 4/7] Add "split after new tuple" optimization.

Add additional heuristics to the algorithm for locating an optimal
split location.  New logic identifies localized monotonically
increasing values.  When this insertion pattern is detected, the page
is split just after the new item that provoked the page split (or leaf
fillfactor is applied in the style of a rightmost page split).

Without this patch, affected cases will reliably leave leaf pages no
more than about 50% full, without future insertions ever making use of
the free space left behind.  50/50 page splits are only appropriate
with a pattern of truly random insertions, where the average space
utilization ends up at 65% - 70%.  This patch addresses the worst case
for space utilization, where leaf pages are unusually sparsely filled
even though there are never any dead tuples.

The optimization is very similar to the long-established fillfactor
optimization used during rightmost page splits, where we usually leave
the new left side of the split 90% full.  Split-after-new-tuple page
splits target essentially the same case.  The splits targeted are
those at the rightmost point of a localized grouping of values, rather
than those at the rightmost point of the entire key space.  Localized
monotonically increasing insertion patterns are presumed to be fairly
common in real-world applications.

Note that even pre-pg_upgrade'd v3 indexes make use of this
optimization.
---
 src/backend/access/nbtree/nbtsplitloc.c | 227 +++++++++++++++++++++++-
 1 file changed, 224 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index cd24480634..7fed1c151e 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -69,6 +69,9 @@ static void _bt_recsplitloc(FindSplitData *state,
 static void _bt_deltasortsplits(FindSplitData *state,
 						double fillfactormult, bool usemult);
 static int	_bt_splitcmp(const void *arg1, const void *arg2);
+static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+					int leaffillfactor, bool *usemult);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
 				bool *newitemonleft);
 static int	_bt_strategy(FindSplitData *state, SplitPoint *lowpage,
@@ -245,9 +248,10 @@ _bt_findsplitloc(Relation rel,
 	 * Start search for a split point among list of legal split points. Give
 	 * primary consideration to equalizing available free space in each half
 	 * of the split initially (start with default strategy), while applying
-	 * rightmost where appropriate. Either of the two other fallback
-	 * strategies may be required for cases with a large number of duplicates
-	 * around the original/space-optimal split point.
+	 * rightmost and split-after-new-item optimizations where appropriate.
+	 * Either of the two other fallback strategies may be required for cases
+	 * with a large number of duplicates around the original/space-optimal
+	 * split point.
 	 *
 	 * Default strategy gives some weight to suffix truncation in deciding a
 	 * split point on leaf pages. It attempts to select a split point where a
@@ -269,6 +273,44 @@ _bt_findsplitloc(Relation rel,
 		usemult = true;
 		fillfactormult = leaffillfactor / 100.0;
 	}
+	else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
+	{
+		/*
+		 * New item inserted at rightmost point among a localized grouping on
+		 * a leaf page -- apply "split after new item" optimization, either by
+		 * applying leaf fillfactor multiplier, or by choosing the exact split
+		 * point that leaves the new item as last on the left. (usemult is set
+		 * for us.)
+		 */
+		if (usemult)
+		{
+			/* fillfactormult should be set based on leaf fillfactor */
+			fillfactormult = leaffillfactor / 100.0;
+		}
+		else
+		{
+			/* find precise split point after newitemoff */
+			for (int i = 0; i < state.nsplits; i++)
+			{
+				SplitPoint *split = state.splits + i;
+
+				if (split->newitemonleft &&
+					newitemoff == split->firstoldonright)
+				{
+					pfree(state.splits);
+					*newitemonleft = true;
+					return newitemoff;
+				}
+			}
+
+			/*
+			 * Cannot legally split after newitemoff; proceed with split
+			 * without using fillfactor multiplier. This is defensive, and
+			 * should never be needed in practice.
+			 */
+			fillfactormult = 0.50;
+		}
+	}
 	else
 	{
 		/* Other leaf page. 50:50 page split. */
@@ -512,6 +554,185 @@ _bt_splitcmp(const void *arg1, const void *arg2)
 	return 0;
 }
 
+/*
+ * Subroutine to determine whether or not the page should be split immediately
+ * after the would-be original page offset for the new/incoming tuple. This
+ * is appropriate when there is a pattern of localized monotonically
+ * increasing insertions into a composite index, grouped by one or more
+ * leading attribute values. This is prevalent in many real world
+ * applications. Consider the example of a composite index on '(invoice_id,
+ * item_no)', where the item_no for each invoice is an identifier assigned in
+ * ascending order (invoice_id could itself be assigned in monotonically
+ * increasing order, but that shouldn't matter). Without this optimization,
+ * approximately 50% of space in leaf pages will be wasted by 50:50/!usemult
+ * page splits. With this optimization, space utilization will be close to
+ * that of a similar index where all tuple insertions modify the current
+ * rightmost leaf page in the index (i.e. typically 90% for leaf pages).
+ *
+ * When the optimization is applied, the new/incoming tuple becomes the last
+ * tuple on the new left page. (Actually, newitemoff > maxoff cases often use
+ * this optimization within indexes where monotonically increasing insertions
+ * of each grouping come in multiple "bursts" over time, such as a composite
+ * index on '(supplier_id, invoice_id, item_no)'. Caller applies leaf
+ * fillfactor in the style of a rightmost leaf page split when newitemoff is
+ * at or very near the end of the original page.)
+ *
+ * This optimization may leave extra free space remaining on the rightmost
+ * page of a "most significant column" grouping of tuples if that grouping
+ * never ends up having future insertions that use the free space. That
+ * effect is self-limiting; a future grouping that becomes the "nearest on the
+ * right" grouping of the affected grouping usually puts the extra free space
+ * to good use. In general, it's important to avoid a pattern of pathological
+ * page splits that consistently do the wrong thing.
+ *
+ * Caller uses optimization when routine returns true, though the exact action
+ * taken by caller varies. Caller uses original leaf page fillfactor in the
+ * standard way rather than using the new item offset directly when *usemult
+ * was also set to true here. Otherwise, caller applies optimization by
+ * locating the legal split point that makes the new tuple the very last tuple
+ * on the left side of the split.
+ */
+static bool
+_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+					int leaffillfactor, bool *usemult)
+{
+	int16		nkeyatts;
+	ItemId		itemid;
+	IndexTuple	tup;
+	int			keepnatts;
+
+	Assert(!state->is_rightmost);
+
+	nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+	/* Assume leaffillfactor will be used by caller for now */
+	*usemult = true;
+
+	/* Single key indexes not considered here */
+	if (nkeyatts == 1)
+		return false;
+
+	/* Ascending insertion pattern never inferred when new item is first */
+	if (state->newitemoff == P_FIRSTKEY)
+		return false;
+
+	/*
+	 * Avoid applying optimization when tuples are wider than a tuple
+	 * consisting of two non-NULL int8/int64 attributes (or four non-NULL
+	 * int4/int32 attributes)
+	 */
+	if (state->newitemsz >
+		MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
+		sizeof(ItemIdData))
+		return false;
+
+	/*
+	 * Only apply optimization on pages with equisized tuples. Surmise that
+	 * page has equisized tuples when page layout is consistent with having
+	 * maxoff-1 non-pivot tuples that are all the same size as the newly
+	 * inserted tuple (note that the possibly-truncated high key isn't counted
+	 * in olddataitemstotal).
+	 */
+	if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
+		return false;
+
+	/*
+	 * At least the first attribute's value must be equal to the corresponding
+	 * value in previous tuple to apply optimization. New item cannot be a
+	 * duplicate, either.
+	 *
+	 * Handle case where new item is to the right of all items on the existing
+	 * page. This is suggestive of monotonically increasing insertions in
+	 * itself, so the "heap TID adjacency" test is not applied here.
+	 * Concurrent insertions from backends associated with the same grouping
+	 * or sub-grouping should still have the optimization applied; if the
+	 * grouping is rather large, splits will consistently end up here.
+	 */
+	if (state->newitemoff > maxoff)
+	{
+		itemid = PageGetItemId(state->page, maxoff);
+		tup = (IndexTuple) PageGetItem(state->page, itemid);
+		keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+		if (keepnatts > 1 && keepnatts <= nkeyatts)
+			return true;
+
+		return false;
+	}
+
+	/*
+	 * When item isn't last (or first) on page, but is deemed suitable for the
+	 * optimization, caller splits at the point immediately after the would-be
+	 * position of the new item, and immediately before the item after the new
+	 * item.
+	 *
+	 * "Low cardinality leading column, high cardinality suffix column"
+	 * indexes with a random insertion pattern (e.g. an index with a boolean
+	 * column, such as an index on '(book_is_in_print, book_isbn)') present us
+	 * with a risk of consistently misapplying the optimization. We're
+	 * willing to accept very occasional misapplication of the optimization,
+	 * provided the cases where we get it wrong are rare and self-limiting.
+	 * Heap TID adjacency strongly suggests that the item just to the left was
+	 * inserted very recently, which prevents most misfirings. Besides, all
+	 * inappropriate cases triggered at this point will still split in the
+	 * middle of the page on average.
+	 */
+	itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
+	tup = (IndexTuple) PageGetItem(state->page, itemid);
+	/* Do cheaper test first */
+	if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
+		return false;
+	/* Check same conditions as rightmost item case, too */
+	keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+	/*
+	 * Don't allow caller to split after a new item when it will result in a
+	 * split point to the right of the point that a leaf fillfactor split
+	 * would use -- have caller apply leaf fillfactor instead. There is no
+	 * advantage to being very aggressive in any case. It may not be legal to
+	 * split very close to maxoff.
+	 */
+	if (keepnatts > 1 && keepnatts <= nkeyatts)
+	{
+		double		interp = (double) state->newitemoff / ((double) maxoff + 1);
+		double		leaffillfactormult = (double) leaffillfactor / 100.0;
+
+		if (interp <= leaffillfactormult)
+			*usemult = false;
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Subroutine for determining if two heap TIDs are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted into
+ * heap relation immediately after the low TID, probably by the same
+ * transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+	BlockNumber lowblk,
+				highblk;
+
+	lowblk = ItemPointerGetBlockNumber(lowhtid);
+	highblk = ItemPointerGetBlockNumber(highhtid);
+
+	/* Make optimistic assumption of adjacency when heap blocks match */
+	if (lowblk == highblk)
+		return true;
+
+	/* When heap block one up, second offset should be FirstOffsetNumber */
+	if (lowblk + 1 == highblk &&
+		ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+		return true;
+
+	return false;
+}
+
 /*
  * Subroutine to find the "best" split point among an array of acceptable
  * candidate split points that split without there being an excessively high
-- 
2.17.1
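
The self-contained sketches below illustrate the heuristics the patch adds. Every type name, constant, and function name in them is a simplified stand-in invented for illustration; none of it is part of the patch or of PostgreSQL itself. This first sketch shows the two cheap size-based filters that _bt_afternewitemoff applies before anything else: the new item must be no wider than a tuple with two non-NULL int8 attributes (plus line pointer overhead), and the page accounting must be consistent with maxoff-1 equisized non-pivot tuples. The header and line pointer sizes here are assumed placeholder values, not the real sizeof(IndexTupleData) and sizeof(ItemIdData):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for MAXALIGN, assuming an 8-byte maximum alignment */
	#define ALIGN8(sz)			(((sz) + 7) & ~((size_t) 7))

	/* Assumed placeholder sizes, for illustration only */
	#define TUPLE_HDR_SZ		8	/* stand-in for sizeof(IndexTupleData) */
	#define LINE_POINTER_SZ		4	/* stand-in for sizeof(ItemIdData) */

	/*
	 * Reject tuples wider than a tuple with two non-NULL int8 attributes.
	 * As in the patch, newitemsz includes line pointer overhead.
	 */
	static bool
	tuple_narrow_enough(size_t newitemsz)
	{
		return newitemsz <=
			ALIGN8(TUPLE_HDR_SZ + sizeof(int64_t) * 2) + LINE_POINTER_SZ;
	}

	/*
	 * Surmise that the page holds equisized tuples when the total item
	 * space is exactly maxoff-1 copies of the new tuple's size (the high
	 * key is not counted in olddataitemstotal).
	 */
	static bool
	page_looks_equisized(size_t newitemsz, int maxoff, size_t olddataitemstotal)
	{
		return newitemsz * (maxoff - 1) == olddataitemstotal;
	}

	int
	main(void)
	{
		/* 20-byte items (16-byte tuple + 4-byte line pointer), 200 items */
		printf("%d %d\n",
			   tuple_narrow_enough(20),						/* 1 */
			   page_looks_equisized(20, 200, 20 * 199));	/* 1 */
		return 0;
	}

Both filters are pure arithmetic on sizes the caller already has, which is what makes it cheap to reject unsuitable pages before any tuple comparisons happen.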
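
The heap TID adjacency test is simple enough to restate outside the server. In this sketch, TidSketch is a simplified stand-in for PostgreSQL's ItemPointerData, and direct field access stands in for ItemPointerGetBlockNumber()/ItemPointerGetOffsetNumber(); the logic mirrors _bt_adjacenthtid from the patch:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Simplified stand-in for ItemPointerData */
	typedef struct TidSketch
	{
		uint32_t	block;		/* heap block number */
		uint16_t	offset;		/* 1-based line pointer offset */
	} TidSketch;

	#define FIRST_OFFSET_NUMBER 1	/* analogous to FirstOffsetNumber */

	static bool
	adjacent_htid(const TidSketch *low, const TidSketch *high)
	{
		/* Make optimistic assumption of adjacency when heap blocks match */
		if (low->block == high->block)
			return true;

		/* Next heap block, with high TID first on that block */
		if (low->block + 1 == high->block &&
			high->offset == FIRST_OFFSET_NUMBER)
			return true;

		return false;
	}

	int
	main(void)
	{
		TidSketch	a = {.block = 7, .offset = 20};
		TidSketch	b = {.block = 7, .offset = 21};	/* same block */
		TidSketch	c = {.block = 8, .offset = 1};	/* next block, first item */
		TidSketch	d = {.block = 9, .offset = 5};	/* neither */

		printf("%d %d %d\n",
			   adjacent_htid(&a, &b),	/* 1 */
			   adjacent_htid(&a, &c),	/* 1 */
			   adjacent_htid(&a, &d));	/* 0 */
		return 0;
	}

The test is deliberately loose: two TIDs on the same heap block are assumed adjacent without checking offsets, since the goal is only a strong hint that the neighboring item was inserted very recently.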
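
Finally, when the new item falls in the interior of the page and passes both the adjacency and prefix tests, the patch compares the new item's interpolated position against the leaf fillfactor to pick between the caller's two actions. A sketch of that gate, under the same stand-in naming caveat (the function name is invented; the arithmetic is taken from the patch):

	#include <stdbool.h>
	#include <stdio.h>

	/*
	 * Returns true when the caller should apply the leaf fillfactor
	 * multiplier (the usemult case) rather than splitting exactly after
	 * the new item's would-be offset.
	 */
	static bool
	use_fillfactor_split(int newitemoff, int maxoff, int leaffillfactor)
	{
		double		interp = (double) newitemoff / ((double) maxoff + 1);
		double		leaffillfactormult = (double) leaffillfactor / 100.0;

		/*
		 * Splitting exactly after the new item is only chosen when that
		 * point is at or before the point a leaf fillfactor split would
		 * use; there is no advantage to splitting further right.
		 */
		return interp > leaffillfactormult;
	}

	int
	main(void)
	{
		/* New item near the middle of a 366-item page, fillfactor 90 */
		printf("%d\n", use_fillfactor_split(180, 366, 90));	/* 0: split after item */
		/* New item very near the end of the same page */
		printf("%d\n", use_fillfactor_split(360, 366, 90));	/* 1: apply fillfactor */
		return 0;
	}

This matches the description in the function's header comment: positions at or before the fillfactor point get the exact split-after-new-item treatment, while positions very near the end of the page fall back on the rightmost-style fillfactor split, which is also always legal.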