From d41c1da841e4ab6245caff02d17b945f8346b47b Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 16 Apr 2024 13:21:36 -0400 Subject: [PATCH v2] Add skip scan to nbtree. Skip scan allows nbtree index scans to efficiently use a composite index on an index (a, b) for queries with a predicate such as "WHERE b = 5". This is useful in cases where the total number of distinct values in the column 'a' is reasonably small (think hundreds, possibly thousands). In effect, a skip scan treats the composite index on (a, b) as if it was a series of disjunct subindexes -- one subindex per distinct 'a' value. We exhaustively "search every subindex" using a qual that behaves like "WHERE a = ANY() AND b = 5". --- src/include/access/nbtree.h | 16 +- src/include/catalog/pg_amproc.dat | 16 + src/include/catalog/pg_proc.dat | 24 + src/include/utils/skipsupport.h | 140 ++ src/backend/access/nbtree/nbtcompare.c | 201 +++ src/backend/access/nbtree/nbtree.c | 10 +- src/backend/access/nbtree/nbtutils.c | 1399 ++++++++++++++++--- src/backend/access/nbtree/nbtvalidate.c | 4 + src/backend/commands/opclasscmds.c | 25 + src/backend/utils/adt/Makefile | 1 + src/backend/utils/adt/date.c | 34 + src/backend/utils/adt/meson.build | 1 + src/backend/utils/adt/selfuncs.c | 30 +- src/backend/utils/adt/skipsupport.c | 54 + src/backend/utils/adt/uuid.c | 65 + src/backend/utils/misc/guc_tables.c | 12 + doc/src/sgml/btree.sgml | 13 + doc/src/sgml/xindex.sgml | 16 +- src/test/regress/expected/alter_generic.out | 6 +- src/test/regress/expected/psql.out | 3 +- src/test/regress/sql/alter_generic.sql | 2 +- src/tools/pgindent/typedefs.list | 3 + 22 files changed, 1899 insertions(+), 176 deletions(-) create mode 100644 src/include/utils/skipsupport.h create mode 100644 src/backend/utils/adt/skipsupport.c diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 749304334..81e99fcc1 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -24,6 +24,7 @@ #include "lib/stringinfo.h" #include "storage/bufmgr.h" #include "storage/shm_toc.h" +#include "utils/skipsupport.h" /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -709,7 +710,8 @@ BTreeTupleGetMaxHeapTID(IndexTuple itup) #define BTINRANGE_PROC 3 #define BTEQUALIMAGE_PROC 4 #define BTOPTIONS_PROC 5 -#define BTNProcs 5 +#define BTSKIPSUPPORT_PROC 6 +#define BTNProcs 6 /* * We need to be able to tell the difference between read and write @@ -1032,9 +1034,15 @@ typedef BTScanPosData *BTScanPos; typedef struct BTArrayKeyInfo { int scan_key; /* index of associated key in keyData */ + int num_elems; /* number of elems (-1 for skip array) */ + + /* State used by standard arrays that store elements in memory */ int cur_elem; /* index of current element in elem_values */ - int num_elems; /* number of elems in current array value */ Datum *elem_values; /* array of num_elems Datums */ + + /* State used by skip arrays, which generate elements procedurally */ + SkipSupportData sksup; /* opclass skip scan support */ + bool null_elem; /* lowest/highest element actually NULL? */ } BTArrayKeyInfo; typedef struct BTScanOpaqueData @@ -1123,6 +1131,7 @@ typedef struct BTReadPageState */ #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_SKIP 0x00040000 /* SK_SEARCHARRAY skip scan key */ #define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) @@ -1159,6 +1168,9 @@ typedef struct BTOptions #define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4 #define PROGRESS_BTREE_PHASE_LEAF_LOAD 5 +/* GUC parameter (just a temporary convenience for reviewers) */ +extern PGDLLIMPORT int skipscan_prefix_cols; + /* * external entry points for btree, in nbtree.c */ diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index f639c3a6a..2a8f6f3f1 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -21,6 +21,8 @@ amprocrighttype => 'bit', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '1', amproc => 'btboolcmp' }, +{ amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', + amprocrighttype => 'bool', amprocnum => '6', amproc => 'btboolskipsupport' }, { amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bpchar_ops', amproclefttype => 'bpchar', @@ -41,12 +43,16 @@ amprocrighttype => 'char', amprocnum => '1', amproc => 'btcharcmp' }, { amprocfamily => 'btree/char_ops', amproclefttype => 'char', amprocrighttype => 'char', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/char_ops', amproclefttype => 'char', + amprocrighttype => 'char', amprocnum => '6', amproc => 'btcharskipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '1', amproc => 'date_cmp' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '2', amproc => 'date_sortsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', + amprocrighttype => 'date', amprocnum => '6', amproc => 'date_skipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'timestamp', amprocnum => '1', amproc => 'date_cmp_timestamp' }, @@ -122,6 +128,8 @@ amprocrighttype => 'int2', amprocnum => '2', amproc => 'btint2sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', + amprocrighttype => 'int2', amprocnum => '6', amproc => 'btint2skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint24cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', @@ -141,6 +149,8 @@ amprocrighttype => 'int4', amprocnum => '2', amproc => 'btint4sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int4', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', + amprocrighttype => 'int4', amprocnum => '6', amproc => 'btint4skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int8', amprocnum => '1', amproc => 'btint48cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', @@ -160,6 +170,8 @@ amprocrighttype => 'int8', amprocnum => '2', amproc => 'btint8sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int8', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', + amprocrighttype => 'int8', amprocnum => '6', amproc => 'btint8skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint84cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', @@ -193,6 +205,8 @@ amprocrighttype => 'oid', amprocnum => '2', amproc => 'btoidsortsupport' }, { amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', amprocrighttype => 'oid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', + amprocrighttype => 'oid', amprocnum => '6', amproc => 'btoidskipsupport' }, { amprocfamily => 'btree/oidvector_ops', amproclefttype => 'oidvector', amprocrighttype => 'oidvector', amprocnum => '1', amproc => 'btoidvectorcmp' }, @@ -261,6 +275,8 @@ amprocrighttype => 'uuid', amprocnum => '2', amproc => 'uuid_sortsupport' }, { amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', amprocrighttype => 'uuid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', + amprocrighttype => 'uuid', amprocnum => '6', amproc => 'uuid_skipsupport' }, { amprocfamily => 'btree/record_ops', amproclefttype => 'record', amprocrighttype => 'record', amprocnum => '1', amproc => 'btrecordcmp' }, { amprocfamily => 'btree/record_image_ops', amproclefttype => 'record', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index d4ac578ae..d02dd1a0c 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1004,18 +1004,27 @@ { oid => '3129', descr => 'sort support', proname => 'btint2sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint2sortsupport' }, +{ oid => '9290', descr => 'skip support', + proname => 'btint2skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint2skipsupport' }, { oid => '351', descr => 'less-equal-greater', proname => 'btint4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int4 int4', prosrc => 'btint4cmp' }, { oid => '3130', descr => 'sort support', proname => 'btint4sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint4sortsupport' }, +{ oid => '9291', descr => 'skip support', + proname => 'btint4skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint4skipsupport' }, { oid => '842', descr => 'less-equal-greater', proname => 'btint8cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int8 int8', prosrc => 'btint8cmp' }, { oid => '3131', descr => 'sort support', proname => 'btint8sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint8sortsupport' }, +{ oid => '9292', descr => 'skip support', + proname => 'btint8skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint8skipsupport' }, { oid => '354', descr => 'less-equal-greater', proname => 'btfloat4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'float4 float4', prosrc => 'btfloat4cmp' }, @@ -1034,12 +1043,18 @@ { oid => '3134', descr => 'sort support', proname => 'btoidsortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btoidsortsupport' }, +{ oid => '9293', descr => 'skip support', + proname => 'btoidskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btoidskipsupport' }, { oid => '404', descr => 'less-equal-greater', proname => 'btoidvectorcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'oidvector oidvector', prosrc => 'btoidvectorcmp' }, { oid => '358', descr => 'less-equal-greater', proname => 'btcharcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'char char', prosrc => 'btcharcmp' }, +{ oid => '9294', descr => 'skip support', + proname => 'btcharskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btcharskipsupport' }, { oid => '359', descr => 'less-equal-greater', proname => 'btnamecmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'name name', prosrc => 'btnamecmp' }, @@ -2214,6 +2229,9 @@ { oid => '3136', descr => 'sort support', proname => 'date_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'date_sortsupport' }, +{ oid => '9295', descr => 'skip support', + proname => 'date_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'date_skipsupport' }, { oid => '4133', descr => 'window RANGE support', proname => 'in_range', prorettype => 'bool', proargtypes => 'date date interval bool bool', @@ -4368,6 +4386,9 @@ { oid => '1693', descr => 'less-equal-greater', proname => 'btboolcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'bool bool', prosrc => 'btboolcmp' }, +{ oid => '9296', descr => 'skip support', + proname => 'btboolskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btboolskipsupport' }, { oid => '1688', descr => 'hash', proname => 'time_hash', prorettype => 'int4', proargtypes => 'time', @@ -9175,6 +9196,9 @@ { oid => '3300', descr => 'sort support', proname => 'uuid_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'uuid_sortsupport' }, +{ oid => '9297', descr => 'skip support', + proname => 'uuid_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'uuid_skipsupport' }, { oid => '2961', descr => 'I/O', proname => 'uuid_recv', prorettype => 'uuid', proargtypes => 'internal', prosrc => 'uuid_recv' }, diff --git a/src/include/utils/skipsupport.h b/src/include/utils/skipsupport.h new file mode 100644 index 000000000..a71a624d0 --- /dev/null +++ b/src/include/utils/skipsupport.h @@ -0,0 +1,140 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.h + * Support routines for B-Tree skip scans. + * + * B-Tree operator classes for discrete types can optionally provide a support + * function for skipping. This is used during skip scans. + * + * A B-tree operator class that implements skip support provides B-tree index + * scans with a way of enumerating and iterating through every possible value + * from the domain of indexable values. This gives scans a way to determine + * the next value in line for a given skip array/scan key/skipped attribute. + * This happens at the point where the scan determines that another primitive + * index scan is required. The next value is used (in combination with at + * least one additional lower-order non-skip key, taken from the SQL query) to + * relocate the scan, skipping over many irrelevant leaf pages in the process. + * + * There are many data types/opclasses where implementing a skip support + * scheme is inherently impossible (or at least impractical). Obviously, it + * would be wrong if the "next" value generated by an opclass was actually + * after the true next value (any index tuples with the true next value would + * be overlooked by the index scan). This partly explains why opclasses are + * under no obligation to implement skip support: a continuous type may have + * no way of generating a useful next value. + * + * Skip scan generally works best with discrete types such as integer, date, + * and boolean: types where we expect indexes to contain large groups of + * contiguous values (in respect of the leading/skipped index attribute). + * When gaps/discontinuities are naturally rare (e.g., a leading identity + * column in a composite index, a date column preceding a product_id column), + * then it makes sense for the skip scan to optimistically assume that the + * next distinct indexable value will find directly matching index tuples. + * The B-Tree code can fall back on explicit next-key probes for any opclass + * that doesn't include a skip support function, but it's best to provide skip + * support whenever possible. The B-Tree code assumes that it's always better + * to use the opclass skip support routine where available. + * + * When a skip scan "bets" that the next indexable value will find an exact + * match, there is significant upside, without any accompanying downside. + * When this optimistic strategy works out, the scan avoids the cost of an + * explicit probe (used in the no-skip-support case to determine the true next + * value in the index's skip attribute). When the strategy doesn't work out, + * then the scan is no worse off than it would have been without skip support. + * The explicit next-key probes used by B-Tree skip scan's fallback path are + * very similar to "failed" optimistic searches for the next indexable value + * (the next value according to the opclass skip support routine). + * + * (FIXME Actually, nbtree does no such thing right now, which is considered a + * blocker to commit.) + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/skipsupport.h + * + *------------------------------------------------------------------------- + */ +#ifndef SKIPSUPPORT_H +#define SKIPSUPPORT_H + +#include "utils/relcache.h" + +typedef struct SkipSupportData *SkipSupport; + +/* + * State/callbacks used by skip arrays to procedurally generate elements. + * + * A BTSKIPSUPPORT_PROC function must set each and every field when called. + * If an opclass can only set some of the fields, then it cannot safely + * provide a skip support routine (and so must rely on the fallback strategy + * used by continuous types, such as numeric). + */ +typedef struct SkipSupportData +{ + /* + * low_elem and high_elem must be set with the lowest and highest possible + * values from the domain of indexable values (assuming standard ascending + * order). This helps the B-Tree code with finding its initial position + * at the leaf level (during the skip scan's first primitive index scan). + * In other words, it gives the B-Tree code a useful value to start from, + * before any data has been read from the index. + * + * low_elem and high_elem can also be used to prove that a qual is + * unsatisfiable in certain cross-type scenarios. + * + * low_elem and high_elem are also used by skip scans to determine when + * they've reached the final possible value (in the current direction). + * It's typical for the scan to run out of leaf pages before it runs out + * of unscanned indexable values, but it's still useful for the scan to + * have a way to recognize when it has reached the last possible value + * (this saves us a useless probe that just lands on the final leaf page). + * + * Note: the logic for determining that the scan has reached the final + * possible value naturally belongs in the B-Tree code. The final value + * isn't necessarily the original high_elem/low_elem set by the opclass. + * In particular, it'll be a lower/higher value when B-Tree preprocessing + * determines that the true range of possible values should be restricted, + * due to the presence of an inequality applied to the index's skipped + * attribute. These are range skip scans. + */ + Datum low_elem; /* lowest sorting/leftmost non-NULL value */ + Datum high_elem; /* highest sorting/rightmost non-NULL value */ + + /* + * Decrement/increment functions. + * + * Returns a decremented/incremented copy of caller's existing datum, + * allocated in caller's memory context (in the case of pass-by-reference + * types). It's not okay for these functions to leak any memory. + * + * Both decrement and increment callbacks are guaranteed to never be + * called with a NULL "existing" arg. (In general it is the B-Tree code's + * job to worry about NULLs, and about whether indexed values are stored + * in ASC order or DESC order.) + * + * The decrement callback is guaranteed to only be called with an + * "existing" value that's strictly > the low_elem set by the opclass. + * Similarly, the increment callback is guaranteed to only be called with + * an "existing" value that's strictly < the high_elem set by the opclass. + * Consequently, opclasses don't have to deal with "overflow" themselves + * (though asserting that the B-Tree code got it right is a good idea). + * + * It's quite possible (and very common) for the B-Tree skip scan caller's + * "existing" datum to just be a straight copy of a value that it copied + * from the index. Operator classes must be liberal in accepting every + * possible representational variation within the underlying data type. + * Opclasses don't have to preserve whatever semantically insignificant + * information the data type might be carrying around, though. + * + * Note: < and > are defined by the opclass's ORDER proc in the usual way. + */ + Datum (*decrement) (Relation rel, Datum existing); + Datum (*increment) (Relation rel, Datum existing); +} SkipSupportData; + +extern bool PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, + bool reverse, SkipSupport sksup); + +#endif /* SKIPSUPPORT_H */ diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 1c72867c8..48a877613 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -58,6 +58,7 @@ #include #include "utils/fmgrprotos.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #ifdef STRESS_SORT_INT_MIN @@ -78,6 +79,39 @@ btboolcmp(PG_FUNCTION_ARGS) PG_RETURN_INT32((int32) a - (int32) b); } +static Datum +bool_decrement(Relation rel, Datum existing) +{ + bool bexisting = DatumGetBool(existing); + + Assert(bexisting == true); + + return BoolGetDatum(bexisting - 1); +} + +static Datum +bool_increment(Relation rel, Datum existing) +{ + bool bexisting = DatumGetBool(existing); + + Assert(bexisting == false); + + return BoolGetDatum(bexisting + 1); +} + +Datum +btboolskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = bool_decrement; + sksup->increment = bool_increment; + sksup->low_elem = BoolGetDatum(false); + sksup->high_elem = BoolGetDatum(true); + + PG_RETURN_VOID(); +} + Datum btint2cmp(PG_FUNCTION_ARGS) { @@ -105,6 +139,39 @@ btint2sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int2_decrement(Relation rel, Datum existing) +{ + int16 iexisting = DatumGetInt16(existing); + + Assert(iexisting > PG_INT16_MIN); + + return Int16GetDatum(iexisting - 1); +} + +static Datum +int2_increment(Relation rel, Datum existing) +{ + int16 iexisting = DatumGetInt16(existing); + + Assert(iexisting < PG_INT16_MAX); + + return Int16GetDatum(iexisting + 1); +} + +Datum +btint2skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int2_decrement; + sksup->increment = int2_increment; + sksup->low_elem = Int16GetDatum(PG_INT16_MIN); + sksup->high_elem = Int16GetDatum(PG_INT16_MAX); + + PG_RETURN_VOID(); +} + Datum btint4cmp(PG_FUNCTION_ARGS) { @@ -128,6 +195,39 @@ btint4sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int4_decrement(Relation rel, Datum existing) +{ + int32 iexisting = DatumGetInt32(existing); + + Assert(iexisting > PG_INT32_MIN); + + return Int32GetDatum(iexisting - 1); +} + +static Datum +int4_increment(Relation rel, Datum existing) +{ + int32 iexisting = DatumGetInt32(existing); + + Assert(iexisting < PG_INT32_MAX); + + return Int32GetDatum(iexisting + 1); +} + +Datum +btint4skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int4_decrement; + sksup->increment = int4_increment; + sksup->low_elem = Int32GetDatum(PG_INT32_MIN); + sksup->high_elem = Int32GetDatum(PG_INT32_MAX); + + PG_RETURN_VOID(); +} + Datum btint8cmp(PG_FUNCTION_ARGS) { @@ -171,6 +271,39 @@ btint8sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int8_decrement(Relation rel, Datum existing) +{ + int64 iexisting = DatumGetInt64(existing); + + Assert(iexisting > PG_INT64_MIN); + + return Int64GetDatum(iexisting - 1); +} + +static Datum +int8_increment(Relation rel, Datum existing) +{ + int64 iexisting = DatumGetInt64(existing); + + Assert(iexisting < PG_INT64_MAX); + + return Int64GetDatum(iexisting + 1); +} + +Datum +btint8skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int8_decrement; + sksup->increment = int8_increment; + sksup->low_elem = Int64GetDatum(PG_INT64_MIN); + sksup->high_elem = Int64GetDatum(PG_INT64_MAX); + + PG_RETURN_VOID(); +} + Datum btint48cmp(PG_FUNCTION_ARGS) { @@ -292,6 +425,39 @@ btoidsortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +oid_decrement(Relation rel, Datum existing) +{ + Oid oexisting = DatumGetObjectId(existing); + + Assert(oexisting > InvalidOid); + + return ObjectIdGetDatum(oexisting - 1); +} + +static Datum +oid_increment(Relation rel, Datum existing) +{ + Oid oexisting = DatumGetObjectId(existing); + + Assert(oexisting < OID_MAX); + + return ObjectIdGetDatum(oexisting + 1); +} + +Datum +btoidskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = oid_decrement; + sksup->increment = oid_increment; + sksup->low_elem = ObjectIdGetDatum(InvalidOid); + sksup->high_elem = ObjectIdGetDatum(OID_MAX); + + PG_RETURN_VOID(); +} + Datum btoidvectorcmp(PG_FUNCTION_ARGS) { @@ -325,3 +491,38 @@ btcharcmp(PG_FUNCTION_ARGS) /* Be careful to compare chars as unsigned */ PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b)); } + +static Datum +char_decrement(Relation rel, Datum existing) +{ + uint8 cexisting = UInt8GetDatum(existing); + + Assert(cexisting > 0); + + return CharGetDatum((uint8) cexisting - 1); +} + +static Datum +char_increment(Relation rel, Datum existing) +{ + uint8 cexisting = UInt8GetDatum(existing); + + Assert(cexisting < UCHAR_MAX); + + return CharGetDatum((uint8) cexisting + 1); +} + +Datum +btcharskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = char_decrement; + sksup->increment = char_increment; + + /* btcharcmp compares chars as unsigned */ + sksup->low_elem = UInt8GetDatum(0); + sksup->high_elem = UInt8GetDatum(UCHAR_MAX); + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 686a3206f..9c9cd48f7 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -324,11 +324,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); BTScanPosInvalidate(so->currPos); BTScanPosInvalidate(so->markPos); - if (scan->numberOfKeys > 0) - so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); - else - so->keyData = NULL; + so->keyData = NULL; so->needPrimScan = false; so->scanBehind = false; so->arrayKeys = NULL; @@ -408,6 +405,11 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, scan->numberOfKeys * sizeof(ScanKeyData)); so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ so->numArrayKeys = 0; /* ditto */ + + /* Release private storage allocated in previous btrescan, if any */ + if (so->keyData != NULL) + pfree(so->keyData); + so->keyData = NULL; } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index d6de2072d..f4442d014 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -28,10 +28,44 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/skipsupport.h" + +/* + * GUC parameter (temporary convenience for reviewers). + * + * To disable all skipping, set skipscan_prefix_cols=0. Otherwise set it to + * the attribute number that you wish to make the last attribute number that + * we can add a skip scan key for. + * + * For example, setting skipscan_prefix_cols=1 before an index scan with qual + * "WHERE b = 1 AND c > 42" will make us generate a skip scan key on the + * column 'a' (which is attnum 1) only, preventing us from adding one for the + * column 'c' (and so 'c' will still have an inequality scan key, required in + * only one direction -- 'c' won't be output as a "range" skip key/array). + * + * The same scan keys will be output when skipscan_prefix_cols=2, given the + * same query/qual, since we naturally get a required equality scan key on 'b' + * from the input scan keys (provided we at least manage to add a skip scan + * key on 'a' that "anchors its required-ness" to the 'b' scan key.) + * + * When skipscan_prefix_cols is set to the number of key columns in the index, + * we're as aggressive as possible about adding skip scan arrays/scan keys. + * This is the current default behavior, and the behavior we're targeting for + * the committed patch (if there are slowdowns from being maximally aggressive + * here then the likely solution is to make _bt_advance_array_keys adaptive, + * rather than trying to predict what will work during preprocessing). + */ +int skipscan_prefix_cols; #define LOOK_AHEAD_REQUIRED_RECHECKS 3 #define LOOK_AHEAD_DEFAULT_DISTANCE 5 +typedef struct BTSkipPreproc +{ + SkipSupportData sksup; /* opclass skip scan support */ + Oid eq_op; /* InvalidOid means don't skip */ +} BTSkipPreproc; + typedef struct BTSortArrayContext { FmgrInfo *sortproc; @@ -62,18 +96,49 @@ static bool _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, FmgrInfo *orderproc, BTArrayKeyInfo *array, bool *qual_ok); -static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan); +static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); +static int _bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts); +static bool _bt_skip_support(Relation rel, int add_skip_attno, + BTSkipPreproc *skipatts); +static inline Datum _bt_apply_decrement(Relation rel, ScanKey skey, + BTArrayKeyInfo *array); +static inline Datum _bt_apply_increment(Relation rel, ScanKey skey, + BTArrayKeyInfo *array); static int _bt_compare_array_elements(const void *a, const void *b, void *arg); static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur); + Datum arrdatum, bool arrnull, + ScanKey cur); +static void _bt_apply_compare_array(ScanKey arraysk, ScanKey skey, + FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok); +static void _bt_apply_compare_skiparray(IndexScanDesc scan, ScanKey arraysk, + ScanKey skey, FmgrInfo *orderproc, + FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok); static int _bt_binsrch_array_skey(FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result); +static void _bt_binsrch_skiparray_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static void _bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_scankey_set_low_or_high(Relation rel, ScanKey skey, + BTArrayKeyInfo *array, bool low_not_high); +static void _bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + Datum tupdatum, bool tupnull); +static void _bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); +static bool _bt_advance_skip_array_key_increment(Relation rel, ScanDirection dir, + BTArrayKeyInfo *array, ScanKey skey, + FmgrInfo *orderproc); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, @@ -251,9 +316,6 @@ _bt_freestack(BTStack stack) * It is convenient for _bt_preprocess_keys caller to have to deal with no * more than one equality strategy array scan key per index attribute. We'll * always be able to set things up that way when complete opfamilies are used. - * Eliminated array scan keys can be recognized as those that have had their - * sk_strategy field set to InvalidStrategy here by us. Caller should avoid - * including these in the scan's so->keyData[] output array. * * We set the scan key references from the scan's BTArrayKeyInfo info array to * offsets into the temp modified input array returned to caller. Scans that @@ -261,18 +323,36 @@ _bt_freestack(BTStack stack) * preprocessing steps are complete. This will convert the scan key offset * references into references to the scan's so->keyData[] output scan keys. * + * We're also responsible for generating skip arrays (and their associated + * scan keys) here. This enables skip scan. We do this for index attributes + * that initially lacked an equality condition within scan->keyData[], iff + * doing so allows a later scan key (that was passed to us in scan->keyData[]) + * to be marked required by later preprocessing on output. + * _bt_decide_skipatts decides which attributes receive skip arrays. + * + * Caller must pass *numberOfKeys to give us a way to change the number of + * input scan keys (our output is caller's input). The returned array can be + * smaller than scan->keyData[] when we eliminated a redundant array scan key + * (redundant with some other array scan key, for the same attribute). It can + * also be larger when we added a skip array/skip scan key. Caller uses this + * to allocate so->keyData[] for the current btrescan. + * * Note: the reason we need to return a temp scan key array, rather than just * scribbling on scan->keyData, is that callers are permitted to call btrescan * without supplying a new set of scankey data. */ static ScanKey -_bt_preprocess_array_keys(IndexScanDesc scan) +_bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - int numberOfKeys = scan->numberOfKeys; + int numArrayKeyData = scan->numberOfKeys; int16 *indoption = rel->rd_indoption; - int numArrayKeys; + BTSkipPreproc skipatts[INDEX_MAX_KEYS]; + int numArrayKeys, + numSkipArrayKeys, + output_ikey = 0; + AttrNumber attno_skip = 1; int origarrayatt = InvalidAttrNumber, origarraykey = -1; Oid origelemtype = InvalidOid; @@ -280,11 +360,14 @@ _bt_preprocess_array_keys(IndexScanDesc scan) MemoryContext oldContext; ScanKey arrayKeyData; /* modified copy of scan->keyData */ - Assert(numberOfKeys); + Assert(scan->numberOfKeys); - /* Quick check to see if there are any array keys */ + /* + * Quick check to see if there are any array keys, or any missing keys we + * can generate a "skip scan" array key for ourselves + */ numArrayKeys = 0; - for (int i = 0; i < numberOfKeys; i++) + for (int i = 0; i < scan->numberOfKeys; i++) { cur = &scan->keyData[i]; if (cur->sk_flags & SK_SEARCHARRAY) @@ -300,6 +383,18 @@ _bt_preprocess_array_keys(IndexScanDesc scan) } } + /* Consider generating skip arrays, and associated equality scan keys */ + numSkipArrayKeys = _bt_decide_skipatts(scan, skipatts); + if (numSkipArrayKeys) + { + /* At least one skip array scan key must be added to arrayKeyData[] */ + numArrayKeys += numSkipArrayKeys; + /* output scan key buffer allocation needs space for skip scan keys */ + numArrayKeyData += numSkipArrayKeys; + + elog(DEBUG2, "skipping %d index attributes", numSkipArrayKeys); + } + /* Quit if nothing to do. */ if (numArrayKeys == 0) return NULL; @@ -317,19 +412,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan) oldContext = MemoryContextSwitchTo(so->arrayContext); - /* Create modifiable copy of scan->keyData in the workspace context */ - arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData)); - memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData)); + /* Create output scan keys in the workspace context */ + arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); /* Allocate space for ORDER procs used to help _bt_checkkeys */ - so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo)); + so->orderProcs = (FmgrInfo *) palloc(numArrayKeyData * sizeof(FmgrInfo)); - /* Now process each array key */ + /* + * Process each array key, and generate skip arrays as needed. Also copy + * every scan->keyData[] input scan key (whether it's an array or not) + * into the arrayKeyData array we'll return to our caller (barring any + * array scan keys that we could eliminate early through array merging). + */ numArrayKeys = 0; - for (int i = 0; i < numberOfKeys; i++) + for (int input_ikey = 0; input_ikey < scan->numberOfKeys; input_ikey++) { FmgrInfo sortproc; FmgrInfo *sortprocp = &sortproc; @@ -345,14 +444,78 @@ _bt_preprocess_array_keys(IndexScanDesc scan) int num_nonnulls; int j; - cur = &arrayKeyData[i]; - if (!(cur->sk_flags & SK_SEARCHARRAY)) - continue; + /* Create a skip array and scan key where indicated by skipatts */ + while (numSkipArrayKeys && + attno_skip <= scan->keyData[input_ikey].sk_attno) + { + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + Oid collation = rel->rd_indcollation[attno_skip - 1]; + Oid eq_op = skipatts[attno_skip - 1].eq_op; + RegProcedure cmp_proc; + + if (!OidIsValid(eq_op)) + { + /* won't skip using this attribute */ + attno_skip++; + continue; + } + + cmp_proc = get_opcode(eq_op); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing oprcode for skipping equals operator %u", eq_op); + + cur = &arrayKeyData[output_ikey]; + Assert(attno_skip <= scan->keyData[input_ikey].sk_attno); + ScanKeyEntryInitialize(cur, + SK_SEARCHARRAY | SK_BT_SKIP, /* flags */ + attno_skip, /* skipped att number */ + BTEqualStrategyNumber, /* equality strategy */ + InvalidOid, /* opclass input subtype */ + collation, /* index column's collation */ + cmp_proc, /* equality operator's proc */ + (Datum) 0); /* constant */ + + /* Initialize array fields */ + so->arrayKeys[numArrayKeys].scan_key = output_ikey; + so->arrayKeys[numArrayKeys].num_elems = -1; + so->arrayKeys[numArrayKeys].cur_elem = 0; + so->arrayKeys[numArrayKeys].elem_values = NULL; /* unusued */ + so->arrayKeys[numArrayKeys].sksup = skipatts[attno_skip - 1].sksup; + so->arrayKeys[numArrayKeys].null_elem = true; /* for now */ + + /* + * We'll need a 3-way ORDER proc to determine when and how the + * consed-up "array" will advance inside _bt_advance_array_keys. + * Set one up now. + */ + _bt_setup_array_cmp(scan, cur, opcintype, + &so->orderProcs[output_ikey], NULL); + + /* + * Prepare to output next scan key (might be another skip scan + * key, or it could be an input scan key from scan->keyData[]) + */ + numSkipArrayKeys--; + numArrayKeys++; + attno_skip++; + output_ikey++; /* keep this scan key/array */ + } /* - * First, deconstruct the array into elements. Anything allocated - * here (including a possibly detoasted array value) is in the - * workspace context. + * Copy input scan key into temp arrayKeyData scan key array. (From + * here on, cur points at our copy of the input scan key.) + */ + cur = &arrayKeyData[output_ikey]; + *cur = scan->keyData[input_ikey]; + + if (!(cur->sk_flags & SK_SEARCHARRAY)) + { + output_ikey++; /* keep this non-array scan key */ + continue; + } + + /* + * Deconstruct the array into elements */ arrayval = DatumGetArrayTypeP(cur->sk_argument); /* We could cache this data, but not clear it's worth it */ @@ -406,6 +569,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) _bt_find_extreme_element(scan, cur, elemtype, BTGreaterStrategyNumber, elem_values, num_nonnulls); + output_ikey++; /* keep this transformed scan key */ continue; case BTEqualStrategyNumber: /* proceed with rest of loop */ @@ -416,6 +580,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) _bt_find_extreme_element(scan, cur, elemtype, BTLessStrategyNumber, elem_values, num_nonnulls); + output_ikey++; /* keep this transformed scan key */ continue; default: elog(ERROR, "unrecognized StrategyNumber: %d", @@ -432,7 +597,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) * sortproc just points to the same proc used during binary searches. */ _bt_setup_array_cmp(scan, cur, elemtype, - &so->orderProcs[i], &sortprocp); + &so->orderProcs[output_ikey], &sortprocp); /* * Sort the non-null elements and eliminate any duplicates. We must @@ -476,11 +641,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) break; } - /* - * Indicate to _bt_preprocess_keys caller that it must ignore - * this scan key - */ - cur->sk_strategy = InvalidStrategy; + /* Throw away this array */ continue; } @@ -511,12 +672,16 @@ _bt_preprocess_array_keys(IndexScanDesc scan) * Note: _bt_preprocess_array_keys_final will fix-up each array's * scan_key field later on, after so->keyData[] has been finalized. */ - so->arrayKeys[numArrayKeys].scan_key = i; + so->arrayKeys[numArrayKeys].scan_key = output_ikey; so->arrayKeys[numArrayKeys].num_elems = num_elems; so->arrayKeys[numArrayKeys].elem_values = elem_values; + so->arrayKeys[numArrayKeys].null_elem = false; /* unused */ numArrayKeys++; + output_ikey++; /* keep this scan key/array */ } + /* Set final number of arrayKeyData[] keys, array keys */ + *numberOfKeys = output_ikey; so->numArrayKeys = numArrayKeys; MemoryContextSwitchTo(oldContext); @@ -624,7 +789,8 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) { BTArrayKeyInfo *array = &so->arrayKeys[arrayidx]; - Assert(array->num_elems > 0); + Assert(array->num_elems > 0 || array->num_elems == -1); + Assert(array->num_elems != -1 || outkey->sk_flags & SK_BT_REQFWD); if (array->scan_key == input_ikey) { @@ -685,6 +851,245 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) so->numArrayKeys, INDEX_MAX_KEYS))); } +/* + * _bt_decide_skipatts() -- set index attributes requiring skip arrays + * + * _bt_preprocess_array_keys helper function. Determines which attributes + * will require skip arrays/scan keys. Also sets up skip support function for + * each of these attributes. + * + * This sets up "skip scan". Adding skip arrays (and associated scan keys) + * allows _bt_preprocess_keys to mark lower-order scan keys (copied from the + * original scan->keyData[] array in the conventional way) as required. The + * overall effect is to enable skipping over irrelevant sections of the index. + * + * Return value is the total number of scan keys to add as "input" scan keys + * for further processing within _bt_preprocess_keys. + */ +static int +_bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts) +{ + Relation rel = scan->indexRelation; + ScanKey inputsk; + AttrNumber attno_inputsk = 1, + attno_skip = 1; + bool attno_has_equal = false, + attno_has_rowcompare = false; + int numSkipArrayKeys = 0, + prev_numSkipArrayKeys = 0; + + Assert(scan->numberOfKeys); + + /* + * XXX Don't support system catalogs for now. Calls to routines like + * get_opfamily_member() are prone to infinite recursion, which we'll need + * to find workaround for (hard-coded lookups?). + */ + if (IsCatalogRelation(rel)) + return 0; + + /* + * FIXME Also don't support parallel scans for now. Must add logic to + * places like _bt_parallel_primscan_schedule so that we account for skip + * arrays when parallel workers serialize their array scan state. + */ + if (scan->parallel_scan) + return 0; + + inputsk = &scan->keyData[0]; + for (int i = 0;; inputsk++, i++) + { + /* + * Backfill skip arrays for any wholly omitted attributes prior to + * attno_inputsk + */ + while (attno_skip < attno_inputsk) + { + if (!_bt_skip_support(rel, attno_skip, &skipatts[attno_skip - 1])) + { + /* + * Opclass lacks a suitable skip support routine. + * + * Return prev_numSkipArrayKeys, so as to avoid including any + * "backfilled" arrays that were supposed to form a contiguous + * group with a skip array on this attribute. There is no + * benefit to adding backfill skip arrays unless we can do so + * for all attributes (all attributes up to and including the + * one immediately before attno_inputsk). + */ + return prev_numSkipArrayKeys; + } + + /* plan on adding a backfill skip array for this attribute */ + numSkipArrayKeys++; + attno_skip++; + } + + /* + * Stop once past the final input scan key. We deliberately never add + * a skip attribute for the attribute of the last input scan key. + * + * If the last input scan key(s) use equality strategy, then a skip + * attribute is superfluous at best. If the last input scan key uses + * an inequality strategy, then adding a skip scan array/scan key is a + * valid though suboptimal transformation. It is better to arrange + * for preprocessing to allow such an input inequality scan key to + * remain an inequality on output. That way _bt_checkkeys will be + * able to make best use of both of its precheck optimizations, but + * _bt_first will be no less capable of efficiently finding the + * starting position for each primitive index scan. + */ + if (i >= scan->numberOfKeys) + break; + + /* + * Cannot keep adding skip arrays after a RowCompare + */ + if (attno_has_rowcompare) + break; + + /* + * Apply temporary testing GUC that can be used to disable skipping + * (either in part or in whole) + */ + if (attno_inputsk > skipscan_prefix_cols) + break; + + /* + * Now consider next attno_inputsk (or keep going if this is an + * additional scan key against the same attribute) + */ + if (attno_inputsk < inputsk->sk_attno) + { + prev_numSkipArrayKeys = numSkipArrayKeys; + + /* + * Now add skip array for previous scan key's attribute, though + * only if the attribute has no equality strategy scan keys. + * + * Adding skip arrays to an attribute that has one or more + * inequality scan keys will cause preprocessing to output a range + * skip array. This will happen when preprocessing proper deals + * with the redundancy between the array and its inequalities. + */ + skipatts[attno_skip - 1].eq_op = InvalidOid; + if (!attno_has_equal) + { + /* Only saw inequalities for the prior attribute */ + if (_bt_skip_support(rel, attno_skip, + &skipatts[attno_skip - 1])) + { + /* add a range skip array for this attribute */ + numSkipArrayKeys++; + } + else + break; + } + else + { + /* + * Saw an equality for the prior attribute, so it doesn't need + * a skip array (not even a range skip array). We'll be able + * to add later skip arrays, too (doesn't matter if the prior + * attribute uses an input opclass without skip support). + */ + } + + /* Set things up for this new attribute */ + attno_skip++; + attno_inputsk = inputsk->sk_attno; + attno_has_equal = false; + } + + /* + * Track if this scan key's attribute has any equality strategy scan + * keys. + * + * Treat IS NULL scan keys as using equal strategy (they'll be marked + * as using it later on, by _bt_fix_scankey_strategy). + */ + if (inputsk->sk_strategy == BTEqualStrategyNumber || + (inputsk->sk_flags & SK_SEARCHNULL)) + attno_has_equal = true; + + /* + * We don't support RowCompare transformation. Remember that we saw a + * RowCompare, so that we don't keep adding skip attributes. + * + * We do still backfill skip attributes before the RowCompare, so that + * it can be marked required. This is similar to what happens when a + * conventional inequality uses an opclass that lacks skip support. + */ + if (inputsk->sk_flags & SK_ROW_HEADER) + attno_has_rowcompare = true; + } + + return numSkipArrayKeys; +} + +/* + * _bt_skip_support() -- set up skip support function in *skipatts + * + * Returns true on success, indicating that we set *skipatts with input + * opclass's skip support routine for caller. Otherwise returns false. + */ +static bool +_bt_skip_support(Relation rel, int add_skip_attno, BTSkipPreproc *skipatts) +{ + int16 *indoption = rel->rd_indoption; + Oid opfamily = rel->rd_opfamily[add_skip_attno - 1]; + Oid opcintype = rel->rd_opcintype[add_skip_attno - 1]; + bool reverse; + + /* Look up input opclass's equality operator */ + skipatts->eq_op = get_opfamily_member(opfamily, opcintype, opcintype, + BTEqualStrategyNumber); + + /* + * We don't expect input opclasses lacking even an equality operator, but + * it's possible. Deal with it gracefully. + */ + if (!OidIsValid(skipatts->eq_op)) + return false; + + /* Have skip support infrastructure set all SkipSupport fields */ + reverse = (indoption[add_skip_attno - 1] & INDOPTION_DESC) != 0; + return PrepareSkipSupportFromOpclass(opfamily, opcintype, reverse, + &skipatts->sksup); +} + +/* + * _bt_apply_decrement() -- Get a decremented copy of skey's arg + * + * Note: this wrapper function calls the opclass increment function when the + * index stores values in descending order. We're "logically decrementing" to + * the previous value in the key space regardless. + */ +static inline Datum +_bt_apply_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + if (!(skey->sk_flags & SK_BT_DESC)) + return array->sksup.decrement(rel, skey->sk_argument); + else + return array->sksup.increment(rel, skey->sk_argument); +} + +/* + * _bt_apply_increment() -- Get an incremented copy of skey's arg + * + * Note: this wrapper function calls the opclass decrement function when the + * index stores values in descending order. We're "logically incrementing" to + * the next value in the key space regardless. + */ +static inline Datum +_bt_apply_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + if (!(skey->sk_flags & SK_BT_DESC)) + return array->sksup.increment(rel, skey->sk_argument); + else + return array->sksup.decrement(rel, skey->sk_argument); +} + /* * _bt_setup_array_cmp() -- Set up array comparison functions * @@ -979,15 +1384,10 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey { Relation rel = scan->indexRelation; Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1]; - int cmpresult = 0, - cmpexact = 0, - matchelem, - new_nelems = 0; FmgrInfo crosstypeproc; FmgrInfo *orderprocp = orderproc; Assert(arraysk->sk_attno == skey->sk_attno); - Assert(array->num_elems > 0); Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); Assert((arraysk->sk_flags & SK_SEARCHARRAY) && arraysk->sk_strategy == BTEqualStrategyNumber); @@ -1000,8 +1400,8 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey * datum of opclass input type for the index's attribute (on-disk type). * We can reuse the array's ORDER proc whenever the non-array scan key's * type is a match for the corresponding attribute's input opclass type. - * Otherwise, we have to do another ORDER proc lookup so that our call to - * _bt_binsrch_array_skey applies the correct comparator. + * Otherwise, we have to do another ORDER proc lookup. We have to be sure + * that _bt_compare_array_skey/_bt_binsrch_array_skey use the right proc. * * Note: we have to support the convention that sk_subtype == InvalidOid * means the opclass input type; this is a hack to simplify life for @@ -1032,11 +1432,46 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey return false; } - /* We have all we need to determine redundancy/contradictoriness */ orderprocp = &crosstypeproc; fmgr_info(cmp_proc, orderprocp); } + /* + * We have all we need to determine redundancy/contradictoriness. + * + * Perform preprocessing of the array based on whether it's a conventional + * array, or a skip array. Sets *qual_ok correctly in passing. + */ + if (array->num_elems != -1) + _bt_apply_compare_array(arraysk, skey, + orderprocp, array, qual_ok); + else + _bt_apply_compare_skiparray(scan, arraysk, skey, + orderproc, orderprocp, + array, qual_ok); + + return true; +} + +/* + * Finish off preprocessing of conventional (non-skip) array scan key when it + * is redundant with (or contradicted by) a non-array scalar scan key. + * + * _bt_compare_array_scankey_args helper function, called after the relevant + * (potentially cross-type) ORDER proc has been looked up successfully. + */ +static void +_bt_apply_compare_array(ScanKey arraysk, ScanKey skey, FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok) +{ + int cmpresult = 0, + cmpexact = 0, + matchelem, + new_nelems = 0; + + Assert(array->num_elems > 0); + Assert(!(arraysk->sk_flags & SK_BT_SKIP)); + matchelem = _bt_binsrch_array_skey(orderprocp, false, NoMovementScanDirection, skey->sk_argument, false, array, @@ -1088,8 +1523,152 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey array->num_elems = new_nelems; *qual_ok = new_nelems > 0; +} - return true; +/* + * Finish off preprocessing of skip array scan key when it is redundant with + * (or contradicted by) a non-array scalar scan key. + * + * _bt_compare_array_scankey_args helper function, called after the relevant + * (potentially cross-type) ORDER proc has been looked up successfully. + * + * Arrays used to skip (skip scan/missing key attribute predicates) work by + * procedurally generating their elements on the fly. We must still + * "eliminate contradictory elements", but it works a little differently: we + * narrow the range of the skip array, such that the array will never + * generated contradicted-by-skey elements. + * + * FIXME Our behavior in scenarios with cross-type operators (range skip scan + * cases) is buggy. We're naively copying datums of a different type from + * scalar inequality scan keys into the array's low_value and high_value + * fields. In practice this tends to not visibly break (in practice types + * that appear within the same operator family tend to have compatible datum + * representations, at least on systems with little-endian byte order). Put + * off dealing with the problem until a later revision of the patch. + * + * It seems likely that the best way to fix this problem will involve keeping + * around the original operator in the BTArrayKeyInfo array struct whenever + * we're passed a "redundant" cross-type inequality operator (an approach + * involving casts/coercions might be tempting, but seems much too fragile). + * We only need to use not-column-input-opclass-type operators for the first + * and/or last array elements from the skip array under this scheme; we'll + * still mostly be dealing with opcintype-typed datums, copied from the index + * (as well as incrementing/decrementing copies of those index tuple datums). + * Importantly, this scheme should work just as well with an opfamily that + * doesn't even have an orderprocp cross-type ORDER operator to pass us here + * (we might even have to keep more than one same-strategy inequality, since + * in general _bt_preprocess_keys might not be able to prove which inequality + * is redundant). + */ +static void +_bt_apply_compare_skiparray(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Form_pg_attribute attr = TupleDescAttr(RelationGetDescr(rel), + skey->sk_attno - 1); + MemoryContext oldContext; + int cmpresult; + + /* + * We don't expect to have to deal with NULLs in non-array/non-skip scan + * key. We expect _bt_preprocess_array_keys to avoid generating a skip + * array for an index attribute with an IS NULL input scan key. It will + * still do so in the presence of IS NOT NULL input scan keys, but + * _bt_compare_scankey_args is expected to handle those for us. + */ + Assert(arraysk->sk_flags & SK_BT_SKIP); + Assert(arraysk->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & SK_ISNULL)); + Assert(array->num_elems == -1); + + /* + * Scalar scan key must be a B-Tree operator, which must always be strict. + * Array shouldn't generate a NULL "array element"/an IS NULL qual. This + * isn't just an optimization; it's strictly necessary for correctness. + */ + array->null_elem = false; + + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + + /* + * detect if scan key argument will be < low_value once + * decremented + */ + cmpresult = _bt_compare_array_skey(orderprocp, + skey->sk_argument, false, + array->sksup.low_elem, false, + arraysk); + if (cmpresult <= 0) + { + /* decrementing would make qual unsatisfiable, so don't try */ + *qual_ok = false; + return; + } + + /* decremented scan key value becomes skip array's new high_value */ + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.high_elem = _bt_apply_decrement(rel, skey, array); + MemoryContextSwitchTo(oldContext); + break; + case BTLessEqualStrategyNumber: + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.high_elem = datumCopy(skey->sk_argument, + attr->attbyval, attr->attlen); + MemoryContextSwitchTo(oldContext); + break; + case BTEqualStrategyNumber: + /* _bt_preprocess_array_keys should have avoided this */ + elog(ERROR, "equality strategy scan key conflicts with skip key for attribute %d on index \"%s\"", + skey->sk_attno, RelationGetRelationName(rel)); + break; + case BTGreaterEqualStrategyNumber: + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.low_elem = datumCopy(skey->sk_argument, + attr->attbyval, attr->attlen); + MemoryContextSwitchTo(oldContext); + break; + case BTGreaterStrategyNumber: + + /* + * detect if scan key argument will be > high_value once + * incremented + */ + cmpresult = _bt_compare_array_skey(orderprocp, + skey->sk_argument, false, + array->sksup.high_elem, false, + arraysk); + if (cmpresult >= 0) + { + /* incrementing would make qual unsatisfiable, so don't try */ + *qual_ok = false; + return; + } + + /* incremented scan key value becomes skip array's new low_value */ + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.low_elem = _bt_apply_increment(rel, skey, array); + MemoryContextSwitchTo(oldContext); + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } + + /* + * Is the qual contradictory, or is it merely "redundant" with consed-up + * skip array? + */ + cmpresult = _bt_compare_array_skey(orderproc, /* don't use orderprocp */ + array->sksup.low_elem, false, + array->sksup.high_elem, false, + arraysk); + *qual_ok = (cmpresult <= 0); } /* @@ -1130,7 +1709,8 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur) + Datum arrdatum, bool arrnull, + ScanKey cur) { int32 result = 0; @@ -1138,14 +1718,14 @@ _bt_compare_array_skey(FmgrInfo *orderproc, if (tupnull) /* NULL tupdatum */ { - if (cur->sk_flags & SK_ISNULL) + if (arrnull) result = 0; /* NULL "=" NULL */ else if (cur->sk_flags & SK_BT_NULLS_FIRST) result = -1; /* NULL "<" NOT_NULL */ else result = 1; /* NULL ">" NOT_NULL */ } - else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ + else if (arrnull) /* NOT_NULL tupdatum, NULL arrdatum */ { if (cur->sk_flags & SK_BT_NULLS_FIRST) result = 1; /* NOT_NULL ">" NULL */ @@ -1211,6 +1791,8 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum arrdatum; Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(!(cur->sk_flags & SK_BT_SKIP)); + Assert(!(cur->sk_flags & SK_ISNULL)); /* plain arrays can't do this */ Assert(cur->sk_strategy == BTEqualStrategyNumber); if (cur_elem_trig) @@ -1246,7 +1828,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, { arrdatum = array->elem_values[low_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result <= 0) { @@ -1274,7 +1856,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, { arrdatum = array->elem_values[high_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result >= 0) { @@ -1301,7 +1883,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, arrdatum = array->elem_values[mid_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result == 0) { @@ -1326,13 +1908,123 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, */ if (low_elem != mid_elem) result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - array->elem_values[low_elem], cur); + array->elem_values[low_elem], false, + cur); *set_elem_result = result; return low_elem; } +/* + * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array + * + * Skip scan arrays procedurally generate their elements on-demand. They + * largely function in the same way as standard arrays. They can be rolled + * over by standard arrays (standard array can also roll over skip arrays). + * + * This routine doesn't return an index into the array, because the array + * doesn't actually have any elements (it has low_value and high_value, which + * indicate the range of values that the array can generate). Note that this + * may include a NULL value/an IS NULL qual (unlike with true arrays). + * + * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true + * array. The value 0 indicates that tupdatum/tupnull is within the range of + * the skip array. Other values indicate what _bt_compare_array_skey returned + * for the best available match to tupdatum/tupnull (in practice this means + * either the lowest item or the highest item in the range of the array). + * + * cur_elem_trig indicates if array advancement was triggered by this skip + * array's scan key. We can apply this information to find the next matching + * array element in the current scan direction using fewer comparisons. + */ +static void +_bt_binsrch_skiparray_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + Datum arrdatum; + bool arrnull; + + Assert(!ScanDirectionIsNoMovement(dir)); + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems == -1); + + /* + * Compare tupdatum against "first array element" in the current scan + * direction first (and allow NULL to be treated as a possible element). + * + * Optimization: don't have to bother with this when passed a skip array + * that is known to have triggered array advancement. + */ + if (!cur_elem_trig) + { + if (ScanDirectionIsForward(dir)) + { + arrdatum = array->sksup.low_elem; + arrnull = array->null_elem && (cur->sk_flags & SK_BT_NULLS_FIRST); + } + else + { + arrdatum = array->sksup.high_elem; + arrnull = array->null_elem && !(cur->sk_flags & SK_BT_NULLS_FIRST); + } + + *set_elem_result = _bt_compare_array_skey(orderproc, + tupdatum, tupnull, + arrdatum, arrnull, + cur); + + /* + * Optimization: return early when >= lower bound happens to be an + * exact match (or when <= upper bound is an exact match during a + * backwards scan) + */ + if (*set_elem_result == 0) + return; + + /* Is tupdatum "before the start" of our lowest "element"? */ + if ((ScanDirectionIsForward(dir) && *set_elem_result < 0) || + (ScanDirectionIsBackward(dir) && *set_elem_result > 0)) + return; + } + + /* + * Now compare tupdatum to the "last array element" in the current scan + * direction (and allow NULL to be treated as a possible element) + */ + if (ScanDirectionIsForward(dir)) + { + arrdatum = array->sksup.high_elem; + arrnull = array->null_elem && !(cur->sk_flags & SK_BT_NULLS_FIRST); + } + else + { + arrdatum = array->sksup.low_elem; + arrnull = array->null_elem && (cur->sk_flags & SK_BT_NULLS_FIRST); + } + + *set_elem_result = _bt_compare_array_skey(orderproc, + tupdatum, tupnull, + arrdatum, arrnull, + cur); + + /* Is tupdatum "after the end" of our highest "element"? */ + if ((ScanDirectionIsForward(dir) && *set_elem_result > 0) || + (ScanDirectionIsBackward(dir) && *set_elem_result < 0)) + return; + + /* + * tupdatum must be within the range of the skip array. Have our caller + * treat tupdatum as one of the array's elements. + */ + *set_elem_result = 0; +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * @@ -1342,29 +2034,257 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - int i; Assert(so->numArrayKeys); Assert(so->qual_ok); - for (i = 0; i < so->numArrayKeys; i++) + for (int i = 0; i < so->numArrayKeys; i++) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->keyData[curArrayKey->scan_key]; - Assert(curArrayKey->num_elems > 0); Assert(skey->sk_flags & SK_SEARCHARRAY); - if (ScanDirectionIsBackward(dir)) - curArrayKey->cur_elem = curArrayKey->num_elems - 1; - else - curArrayKey->cur_elem = 0; - skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; + _bt_scankey_set_low_or_high(rel, skey, curArrayKey, + ScanDirectionIsForward(dir)); } so->scanBehind = false; } +/* + * _bt_scankey_decrement() -- decrement scan key's sk_argument + * + * Unsets scan key "IS NULL" flags when required, and handles memory + * management for pass-by-reference types. + */ +static void +_bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (skey->sk_flags & SK_ISNULL) + _bt_scankey_unset_isnull(rel, skey, array); + else + { + Datum dec_sk_argument; + Form_pg_attribute attr; + + /* Get a decremented copy of existing sk_argument */ + dec_sk_argument = _bt_apply_decrement(rel, skey, array); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Set decremented copy of original sk_argument in scan key */ + skey->sk_argument = dec_sk_argument; + } +} + +/* + * _bt_scankey_increment() -- increment scan key's sk_argument + * + * Unsets scan key "IS NULL" flags when required, and handles memory + * management for pass-by-reference types. + */ +static void +_bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (skey->sk_flags & SK_ISNULL) + _bt_scankey_unset_isnull(rel, skey, array); + else + { + Datum inc_sk_argument; + Form_pg_attribute attr; + + /* Get an incremented copy of existing sk_argument */ + inc_sk_argument = _bt_apply_increment(rel, skey, array); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Set incremented copy of original sk_argument in scan key */ + skey->sk_argument = inc_sk_argument; + } +} + +/* + * _bt_scankey_set_low_or_high() -- Set array scan key to lowest/highest element + * + * Caller also passes associated scan key, which will have its argument set to + * the lowest/highest array value in passing. + */ +static void +_bt_scankey_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + bool low_not_high) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (array->num_elems != -1) + { + /* set low or high element for conventional array */ + int set_elem = 0; + + Assert(!(skey->sk_flags & SK_BT_SKIP)); + + if (!low_not_high) + set_elem = array->num_elems - 1; + + /* + * Just copy over array datum (only skip arrays require freeing and + * allocating memory for sk_argument) + */ + array->cur_elem = set_elem; + skey->sk_argument = array->elem_values[set_elem]; + + return; + } + + /* set low or high element for skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + if (array->null_elem && + (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) + { + /* Set element to NULL (lowest/highest element) */ + skey->sk_argument = (Datum) 0; + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); + } + else + { + /* Lowest array element isn't NULL */ + if (low_not_high) + skey->sk_argument = datumCopy(array->sksup.low_elem, + attr->attbyval, attr->attlen); + else + skey->sk_argument = datumCopy(array->sksup.high_elem, + attr->attbyval, attr->attlen); + + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + } +} + +/* + * _bt_scankey_set_element() -- Set skip array scan key's sk_argument + * + * Sets scan key to "IS NULL" when required, and handles memory management for + * pass-by-reference types. + */ +static void +_bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + Datum tupdatum, bool tupnull) +{ + /* tupdatum within the range of low_value/high_value */ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* + * Treat tupdatum/tupnull as a matching array element. + * + * We just copy tupdatum into the array's scan key (there is no + * conventional array element for us to set, of course). + */ + if (tupnull) + { + /* + * Unlike standard arrays, skip arrays sometimes need to locate NULLs. + * We can treat them as just another value from the domain of indexed + * values. + */ + Assert(array->null_elem); + Assert(!(skey->sk_flags & (SK_SEARCHNULL | SK_ISNULL))); + + skey->sk_argument = (Datum) 0; + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); + } + else + { + skey->sk_argument = datumCopy(tupdatum, + attr->attbyval, attr->attlen); + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + } +} + +/* + * _bt_scankey_unset_isnull() -- increment/decrement scan key from NULL + * + * Unsets scan key's "IS NULL" marking, and sets the non-NULL value from the + * array immediately before (or immediate after) NULL in the key space. + */ +static void +_bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(skey->sk_flags & SK_ISNULL); + Assert(array->null_elem); + + /* + * sk_argument must be set to whatever non-NULL value comes immediately + * before or after NULL + */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + if (skey->sk_flags & SK_BT_NULLS_FIRST) + skey->sk_argument = datumCopy(array->sksup.low_elem, + attr->attbyval, attr->attlen); + else + skey->sk_argument = datumCopy(array->sksup.high_elem, + attr->attbyval, attr->attlen); +} + +/* + * _bt_scankey_set_isnull() -- increment/decrement scan key to NULL + * + * Sets scan key to "IS NULL", and handles memory management for + * pass-by-reference types. + */ +static void +_bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & SK_ISNULL)); + Assert(array->null_elem); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Set sk_argument to NULL */ + skey->sk_argument = (Datum) 0; + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + /* * _bt_advance_array_keys_increment() -- Advance to next set of array elements * @@ -1380,6 +2300,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; /* @@ -1391,10 +2312,24 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->keyData[curArrayKey->scan_key]; + FmgrInfo *orderproc = &so->orderProcs[curArrayKey->scan_key]; int cur_elem = curArrayKey->cur_elem; int num_elems = curArrayKey->num_elems; bool rolled = false; + /* Handle incrementing a skip array */ + if (num_elems == -1) + { + /* Attempt to incrementally advance this skip scan array */ + if (_bt_advance_skip_array_key_increment(rel, dir, curArrayKey, + skey, orderproc)) + return true; + + /* Array rolled over. Need to advance next array key, if any. */ + continue; + } + + /* Handle incrementing a true array */ if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems) { cur_elem = 0; @@ -1411,7 +2346,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) if (!rolled) return true; - /* Need to advance next array key, if any */ + /* Array rolled over. Need to advance next array key, if any. */ } /* @@ -1429,6 +2364,95 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) return false; } +/* + * _bt_advance_skip_array_key_increment() -- increment a skip scan array + * + * Returns true when the skip array was successfully incremented to the next + * value in the current scan direction, dir. Otherwise handles roll over by + * setting array to its final element for the current scan direction. + */ +static bool +_bt_advance_skip_array_key_increment(Relation rel, ScanDirection dir, + BTArrayKeyInfo *array, ScanKey skey, + FmgrInfo *orderproc) +{ + Datum sk_argument = skey->sk_argument; + bool sk_isnull = (skey->sk_flags & SK_ISNULL) != 0; + int compare; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(array->num_elems == -1); + + if (ScanDirectionIsForward(dir)) + { + /* high_elem is final non-NULL element in current scan direction */ + compare = _bt_compare_array_skey(orderproc, + array->sksup.high_elem, false, + sk_argument, sk_isnull, + skey); + if (compare > 0) + { + /* Increment non-NULL element to next non-NULL element */ + _bt_scankey_increment(rel, skey, array); + + return true; + } + else if (compare == 0 && array->null_elem && + !(skey->sk_flags & SK_BT_NULLS_FIRST)) + { + /* + * Existing sk_argument was already equal to high_elem. Increment + * from high_elem to final NULL element (without calling opclass + * support function, which doesn't know how to handle NULLs). + */ + _bt_scankey_set_isnull(rel, skey, array); + + return true; + } + + /* Exhausted all array elements in current scan direction */ + } + else + { + /* low_elem is final non-NULL element in current scan direction */ + compare = _bt_compare_array_skey(orderproc, + array->sksup.low_elem, false, + sk_argument, sk_isnull, + skey); + if (compare < 0) + { + /* Decrement non-NULL element to previous non-NULL element */ + _bt_scankey_decrement(rel, skey, array); + + return true; + } + else if (compare == 0 && array->null_elem && + (skey->sk_flags & SK_BT_NULLS_FIRST)) + { + /* + * Existing sk_argument was already equal to low_elem. Decrement + * from low_elem to final NULL element (without calling opclass + * support function, which doesn't know how to handle NULLs). + */ + _bt_scankey_set_isnull(rel, skey, array); + + return true; + } + + /* Exhausted all array elements in current scan direction */ + } + + /* + * Skip array rolls over. Start over at the array's lowest sorting value + * (or its highest value, for backward scans). + */ + _bt_scankey_set_low_or_high(rel, skey, array, ScanDirectionIsForward(dir)); + + /* Caller must consider earlier/more significant arrays in turn */ + return false; +} + /* * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays * @@ -1466,6 +2490,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; int arrayidx = 0; @@ -1473,7 +2498,6 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { ScanKey cur = so->keyData + ikey; BTArrayKeyInfo *array = NULL; - int first_elem_dir; if (!(cur->sk_flags & SK_SEARCHARRAY) || cur->sk_strategy != BTEqualStrategyNumber) @@ -1485,16 +2509,10 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) continue; - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; + Assert(array->num_elems != -1); /* No skipping of non-required arrays */ - if (array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); } } @@ -1558,6 +2576,8 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) { ScanKey cur = so->keyData + ikey; + Datum sk_argument = cur->sk_argument; + bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0; Datum tupdatum; bool tupnull; int32 result; @@ -1621,7 +2641,8 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, result = _bt_compare_array_skey(&so->orderProcs[ikey], tupdatum, tupnull, - cur->sk_argument, cur); + sk_argument, sk_isnull, + cur); /* * Does this comparison indicate that caller must _not_ advance the @@ -1954,18 +2975,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (beyond_end_advance) { - int final_elem_dir; - - if (ScanDirectionIsBackward(dir) || !array) - final_elem_dir = 0; - else - final_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != final_elem_dir) - { - array->cur_elem = final_elem_dir; - cur->sk_argument = array->elem_values[final_elem_dir]; - } + if (array) + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); continue; } @@ -1990,18 +3002,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (!all_required_satisfied || cur->sk_attno > tupnatts) { - int first_elem_dir; - - if (ScanDirectionIsForward(dir) || !array) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + if (array) + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); continue; } @@ -2019,15 +3022,27 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, /* * Binary search for closest match that's available from the array */ - set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], - cur_elem_trig, dir, - tupdatum, tupnull, array, cur, - &result); + if (array->num_elems != -1) + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); - Assert(set_elem >= 0 && set_elem < array->num_elems); + /* + * Skip array. "Binary search" by checking if tupdatum/tupnull + * are within the low_value/high_value range of the skip array. + */ + else + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); } else { + Datum sk_argument = cur->sk_argument; + bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0; + Assert(sktrig_required && required); /* @@ -2041,7 +3056,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ result = _bt_compare_array_skey(&so->orderProcs[ikey], tupdatum, tupnull, - cur->sk_argument, cur); + sk_argument, sk_isnull, cur); } /* @@ -2061,6 +3076,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * its final element is. Once outside the loop we'll then "increment * this array's set_elem" by calling _bt_advance_array_keys_increment. * That way the process rolls over to higher order arrays as needed. + * The skip array case will set the array's scan key to the final + * valid element for the current scan direction, which is equivalent + * (when we have a real set_elem "match" it's just the final element + * in the current scan direction). * * Under this scheme any required arrays only ever ratchet forwards * (or backwards), and always do so to the maximum possible extent @@ -2100,11 +3119,62 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } } - /* Advance array keys, even when set_elem isn't an exact match */ - if (array && array->cur_elem != set_elem) + /* Advance array keys, even when we don't have an exact match */ + + if (!array) + continue; /* no element to set in non-array */ + + /* Conventional arrays have a valid set_elem for us to advance to */ + if (array->num_elems != -1) { - array->cur_elem = set_elem; - cur->sk_argument = array->elem_values[set_elem]; + if (array->cur_elem != set_elem) + { + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; + } + + continue; + } + + /* + * Conceptually, skip arrays also have array elements. The actual + * elements/values are generated procedurally and on demand. + */ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + Assert(required); + + if (result == 0) + { + /* + * Anything within the range of possible element values is treated + * as "a match for one of the array's elements". Store the next + * scan key argument value by taking a copy of the tupdatum value + * from caller's tuple (or set scan key IS NULL when tupnull, iff + * the array's range of possible elements covers NULL). + */ + _bt_scankey_set_element(rel, cur, array, tupdatum, tupnull); + } + else if (beyond_end_advance) + { + /* + * We need to set the array element to the final "element" in the + * current scan direction for "beyond end of array element" array + * advancement. See above for an explanation. + */ + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); + } + else + { + /* + * The closest matching element is the lowest element; even that + * still puts us ahead of caller's tuple in the key space. This + * process has to carry to any lower-order arrays. See above for + * an explanation. + */ + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); } } @@ -2460,10 +3530,12 @@ end_toplevel_scan: /* * _bt_preprocess_keys() -- Preprocess scan keys * + * The first call here (per btrescan) allocates so->keyData[]. * The given search-type keys (taken from scan->keyData[]) * are copied to so->keyData[] with possible transformation. * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets - * the number of output keys (possibly less, never greater). + * the number of output keys. Calling here a second time (during the same + * btrescan) is a no-op. * * The output keys are marked with additional sk_flags bits beyond the * system-standard bits supplied by the caller. The DESC and NULLS_FIRST @@ -2483,6 +3555,8 @@ end_toplevel_scan: * within each attribute may be done as a byproduct of the processing here. * That process must leave array scan keys (within an attribute) in the same * order as corresponding entries from the scan's BTArrayKeyInfo array info. + * We might also cons up skip array scan keys that weren't present in the + * original input keys; these are also output in standard attribute order. * * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD * if they must be satisfied in order to continue the scan forward or backward @@ -2550,9 +3624,7 @@ _bt_preprocess_keys(IndexScanDesc scan) int16 *indoption = scan->indexRelation->rd_indoption; int new_numberOfKeys; int numberOfEqualCols; - ScanKey inkeys; - ScanKey outkeys; - ScanKey cur; + ScanKey inputsk; BTScanKeyPreproc xform[BTMaxStrategyNumber]; bool test_result; int i, @@ -2584,7 +3656,7 @@ _bt_preprocess_keys(IndexScanDesc scan) return; /* done if qual-less scan */ /* If any keys are SK_SEARCHARRAY type, set up array-key info */ - arrayKeyData = _bt_preprocess_array_keys(scan); + arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys); if (!so->qual_ok) { /* unmatchable array, so give up */ @@ -2598,32 +3670,36 @@ _bt_preprocess_keys(IndexScanDesc scan) */ if (arrayKeyData) { - inkeys = arrayKeyData; + inputsk = arrayKeyData; /* Also maintain keyDataMap for remapping so->orderProc[] later */ keyDataMap = MemoryContextAlloc(so->arrayContext, numberOfKeys * sizeof(int)); } else - inkeys = scan->keyData; + inputsk = scan->keyData; + + /* + * Now that we have an estimate of the number of output scan keys + * (including any skip array scan keys), allocate space for them + */ + so->keyData = palloc(sizeof(ScanKeyData) * numberOfKeys); - outkeys = so->keyData; - cur = &inkeys[0]; /* we check that input keys are correctly ordered */ - if (cur->sk_attno < 1) + if (inputsk->sk_attno < 1) elog(ERROR, "btree index keys must be ordered by attribute"); /* We can short-circuit most of the work if there's just one key */ if (numberOfKeys == 1) { /* Apply indoption to scankey (might change sk_strategy!) */ - if (!_bt_fix_scankey_strategy(cur, indoption)) + if (!_bt_fix_scankey_strategy(inputsk, indoption)) so->qual_ok = false; - memcpy(outkeys, cur, sizeof(ScanKeyData)); + memcpy(so->keyData, inputsk, sizeof(ScanKeyData)); so->numberOfKeys = 1; /* We can mark the qual as required if it's for first index col */ - if (cur->sk_attno == 1) - _bt_mark_scankey_required(outkeys); + if (inputsk->sk_attno == 1) + _bt_mark_scankey_required(so->keyData); if (arrayKeyData) { /* @@ -2631,8 +3707,8 @@ _bt_preprocess_keys(IndexScanDesc scan) * (we'll miss out on the single value array transformation, but * that's not nearly as important when there's only one scan key) */ - Assert(cur->sk_flags & SK_SEARCHARRAY); - Assert(cur->sk_strategy != BTEqualStrategyNumber || + Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY); + Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber || (so->arrayKeys[0].scan_key == 0 && OidIsValid(so->orderProcs[0].fn_oid))); } @@ -2660,12 +3736,12 @@ _bt_preprocess_keys(IndexScanDesc scan) * handle after-last-key processing. Actual exit from the loop is at the * "break" statement below. */ - for (i = 0;; cur++, i++) + for (i = 0;; inputsk++, i++) { if (i < numberOfKeys) { /* Apply indoption to scankey (might change sk_strategy!) */ - if (!_bt_fix_scankey_strategy(cur, indoption)) + if (!_bt_fix_scankey_strategy(inputsk, indoption)) { /* NULL can't be matched, so give up */ so->qual_ok = false; @@ -2677,12 +3753,12 @@ _bt_preprocess_keys(IndexScanDesc scan) * If we are at the end of the keys for a particular attr, finish up * processing and emit the cleaned-up keys. */ - if (i == numberOfKeys || cur->sk_attno != attno) + if (i == numberOfKeys || inputsk->sk_attno != attno) { int priorNumberOfEqualCols = numberOfEqualCols; /* check input keys are correctly ordered */ - if (i < numberOfKeys && cur->sk_attno < attno) + if (i < numberOfKeys && inputsk->sk_attno < attno) elog(ERROR, "btree index keys must be ordered by attribute"); /* @@ -2741,7 +3817,8 @@ _bt_preprocess_keys(IndexScanDesc scan) return; } /* else discard the redundant non-equality key */ - Assert(!array || array->num_elems > 0); + Assert(!array || array->num_elems > 0 || + array->num_elems == -1); xform[j].skey = NULL; xform[j].ikey = -1; } @@ -2786,7 +3863,7 @@ _bt_preprocess_keys(IndexScanDesc scan) } /* - * Emit the cleaned-up keys into the outkeys[] array, and then + * Emit the cleaned-up keys into the so->keyData[] array, and then * mark them if they are required. They are required (possibly * only in one direction) if all attrs before this one had "=". */ @@ -2794,7 +3871,7 @@ _bt_preprocess_keys(IndexScanDesc scan) { if (xform[j].skey) { - ScanKey outkey = &outkeys[new_numberOfKeys++]; + ScanKey outkey = &so->keyData[new_numberOfKeys++]; memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); if (arrayKeyData) @@ -2811,19 +3888,19 @@ _bt_preprocess_keys(IndexScanDesc scan) break; /* Re-initialize for new attno */ - attno = cur->sk_attno; + attno = inputsk->sk_attno; memset(xform, 0, sizeof(xform)); } /* check strategy this key's operator corresponds to */ - j = cur->sk_strategy - 1; + j = inputsk->sk_strategy - 1; /* if row comparison, push it directly to the output array */ - if (cur->sk_flags & SK_ROW_HEADER) + if (inputsk->sk_flags & SK_ROW_HEADER) { - ScanKey outkey = &outkeys[new_numberOfKeys++]; + ScanKey outkey = &so->keyData[new_numberOfKeys++]; - memcpy(outkey, cur, sizeof(ScanKeyData)); + memcpy(outkey, inputsk, sizeof(ScanKeyData)); if (arrayKeyData) keyDataMap[new_numberOfKeys - 1] = i; if (numberOfEqualCols == attno - 1) @@ -2837,19 +3914,8 @@ _bt_preprocess_keys(IndexScanDesc scan) continue; } - /* - * Does this input scan key require further processing as an array? - */ - if (cur->sk_strategy == InvalidStrategy) - { - /* _bt_preprocess_array_keys marked this array key redundant */ - Assert(arrayKeyData); - Assert(cur->sk_flags & SK_SEARCHARRAY); - continue; - } - - if (cur->sk_strategy == BTEqualStrategyNumber && - (cur->sk_flags & SK_SEARCHARRAY)) + if (inputsk->sk_strategy == BTEqualStrategyNumber && + (inputsk->sk_flags & SK_SEARCHARRAY)) { /* _bt_preprocess_array_keys kept this array key */ Assert(arrayKeyData); @@ -2863,7 +3929,7 @@ _bt_preprocess_keys(IndexScanDesc scan) if (xform[j].skey == NULL) { /* nope, so this scan key wins by default (at least for now) */ - xform[j].skey = cur; + xform[j].skey = inputsk; xform[j].ikey = i; xform[j].arrayidx = arrayidx; } @@ -2881,7 +3947,7 @@ _bt_preprocess_keys(IndexScanDesc scan) /* * Have to set up array keys */ - if ((cur->sk_flags & SK_SEARCHARRAY)) + if ((inputsk->sk_flags & SK_SEARCHARRAY)) { array = &so->arrayKeys[arrayidx - 1]; orderproc = so->orderProcs + i; @@ -2909,13 +3975,15 @@ _bt_preprocess_keys(IndexScanDesc scan) */ } - if (_bt_compare_scankey_args(scan, cur, cur, xform[j].skey, - array, orderproc, &test_result)) + if (_bt_compare_scankey_args(scan, inputsk, inputsk, + xform[j].skey, array, orderproc, + &test_result)) { /* Have all we need to determine redundancy */ if (test_result) { - Assert(!array || array->num_elems > 0); + Assert(!array || array->num_elems > 0 || + array->num_elems == -1); /* * New key is more restrictive, and so replaces old key... @@ -2923,7 +3991,7 @@ _bt_preprocess_keys(IndexScanDesc scan) if (j != (BTEqualStrategyNumber - 1) || !(xform[j].skey->sk_flags & SK_SEARCHARRAY)) { - xform[j].skey = cur; + xform[j].skey = inputsk; xform[j].ikey = i; xform[j].arrayidx = arrayidx; } @@ -2936,7 +4004,7 @@ _bt_preprocess_keys(IndexScanDesc scan) * scan key. _bt_compare_scankey_args expects us to * always keep arrays (and discard non-arrays). */ - Assert(!(cur->sk_flags & SK_SEARCHARRAY)); + Assert(!(inputsk->sk_flags & SK_SEARCHARRAY)); } } else if (j == (BTEqualStrategyNumber - 1)) @@ -2959,14 +4027,14 @@ _bt_preprocess_keys(IndexScanDesc scan) * even with incomplete opfamilies. _bt_advance_array_keys * depends on this. */ - ScanKey outkey = &outkeys[new_numberOfKeys++]; + ScanKey outkey = &so->keyData[new_numberOfKeys++]; memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); if (arrayKeyData) keyDataMap[new_numberOfKeys - 1] = xform[j].ikey; if (numberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); - xform[j].skey = cur; + xform[j].skey = inputsk; xform[j].ikey = i; xform[j].arrayidx = arrayidx; } @@ -3057,10 +4125,11 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->scan_key != ikey) return false; - if (array->num_elems <= 0) + if (array->num_elems == 0 || array->num_elems < -1) return false; - if (cur->sk_argument != array->elem_values[array->cur_elem]) + if (array->num_elems != -1 && + cur->sk_argument != array->elem_values[array->cur_elem]) return false; if (last_sk_attno > cur->sk_attno) return false; @@ -3135,6 +4204,22 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, bool leftnull, rightnull; + /* Handle skip array comparison with IS NOT NULL scan key */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP) + { + /* Shouldn't generate skip array in presence of IS NULL key */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNULL)); + Assert((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNOTNULL); + + /* Don't allow skip array to generate IS NULL scan key/element */ + Assert(array->num_elems == -1); + array->null_elem = false; + + /* IS NOT NULL key (could be leftarg or rightarg) now redundant */ + *result = true; + return true; + } + if (leftarg->sk_flags & SK_ISNULL) { Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); @@ -3208,6 +4293,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, { /* Can't make the comparison */ *result = false; /* suppress compiler warnings */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); return false; } @@ -3380,13 +4466,6 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) return true; } - if (skey->sk_strategy == InvalidStrategy) - { - /* Already-eliminated array scan key; don't need to fix anything */ - Assert(skey->sk_flags & SK_SEARCHARRAY); - return true; - } - /* Adjust strategy for DESC, if we didn't already */ if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC)) skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy); diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c index e9d4cd60d..96d0d9185 100644 --- a/src/backend/access/nbtree/nbtvalidate.c +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -114,6 +114,10 @@ btvalidate(Oid opclassoid) case BTOPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case BTSKIPSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index b8b5c147c..a86dbf71b 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -1330,6 +1330,31 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("btree equal image functions must not be cross-type"))); } + else if (member->number == BTSKIPSUPPORT_PROC) + { + if (procform->pronargs != 1 || + procform->proargtypes.values[0] != INTERNALOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must accept type \"internal\""))); + if (procform->prorettype != VOIDOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must return void"))); + + /* + * pg_amproc functions are indexed by (lefttype, righttype), but a + * skip support function doesn't make sense in cross-type + * scenarios. The same opclass opcintype OID is always used for + * lefttype and righttype. Providing a cross-type routine isn't + * sensible. Reject cross-type ALTER OPERATOR FAMILY ... ADD + * FUNCTION 6 statements here. + */ + if (member->lefttype != member->righttype) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must not be cross-type"))); + } } else if (amoid == HASH_AM_OID) { diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index edb09d4e3..e945686c8 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -96,6 +96,7 @@ OBJS = \ rowtypes.o \ ruleutils.o \ selfuncs.o \ + skipsupport.o \ tid.o \ timestamp.o \ trigfuncs.o \ diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 9c854e0e5..ea3d0f4b5 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -34,6 +34,7 @@ #include "utils/date.h" #include "utils/datetime.h" #include "utils/numeric.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" /* @@ -455,6 +456,39 @@ date_sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +date_decrement(Relation rel, Datum existing) +{ + DateADT dexisting = DatumGetDateADT(existing); + + Assert(dexisting > DATEVAL_NOBEGIN); + + return DateADTGetDatum(dexisting - 1); +} + +static Datum +date_increment(Relation rel, Datum existing) +{ + DateADT dexisting = DatumGetDateADT(existing); + + Assert(dexisting < DATEVAL_NOEND); + + return DateADTGetDatum(dexisting + 1); +} + +Datum +date_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = date_decrement; + sksup->increment = date_increment; + sksup->low_elem = DateADTGetDatum(DATEVAL_NOBEGIN); + sksup->high_elem = DateADTGetDatum(DATEVAL_NOEND); + + PG_RETURN_VOID(); +} + Datum date_finite(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index 8c6fc80c3..91682edd5 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -83,6 +83,7 @@ backend_sources += files( 'rowtypes.c', 'ruleutils.c', 'selfuncs.c', + 'skipsupport.c', 'tid.c', 'timestamp.c', 'trigfuncs.c', diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 5f5d7959d..c1df7be9f 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6800,6 +6800,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, List *indexBoundQuals; int indexcol; bool eqQualHere; + bool found_skip; bool found_saop; bool found_is_null_op; double num_sa_scans; @@ -6825,6 +6826,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, indexBoundQuals = NIL; indexcol = 0; eqQualHere = false; + found_skip = false; found_saop = false; found_is_null_op = false; num_sa_scans = 1; @@ -6833,15 +6835,38 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, IndexClause *iclause = lfirst_node(IndexClause, lc); ListCell *lc2; + /* + * XXX For now we just cost skip scans via generic rules: make a + * uniform assumption that there will be 10 primitive index scans per + * skipped attribute, relying on the "1/3 of all index pages" cap that + * this costing has used since Postgres 17. Also assume that skipping + * won't take place for an index that has fewer than 100 pages. + * + * The current approach to costing leaves much to be desired, but is + * at least better than nothing at all (keeping the code as it is on + * HEAD just makes testing and review inconvenient). + */ if (indexcol != iclause->indexcol) { /* Beginning of a new column's quals */ if (!eqQualHere) - break; /* done if no '=' qual for indexcol */ + { + found_skip = true; /* skip when no '=' qual for indexcol */ + if (index->pages < 100) + break; + num_sa_scans += 10; + } eqQualHere = false; indexcol++; if (indexcol != iclause->indexcol) - break; /* no quals at all for indexcol */ + { + /* no quals at all for indexcol */ + found_skip = true; + if (index->pages < 100) + break; + num_sa_scans += 10 * (indexcol - iclause->indexcol); + continue; + } } /* Examine each indexqual associated with this index clause */ @@ -6914,6 +6939,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, if (index->unique && indexcol == index->nkeycolumns - 1 && eqQualHere && + !found_skip && !found_saop && !found_is_null_op) numIndexTuples = 1.0; diff --git a/src/backend/utils/adt/skipsupport.c b/src/backend/utils/adt/skipsupport.c new file mode 100644 index 000000000..9665e4985 --- /dev/null +++ b/src/backend/utils/adt/skipsupport.c @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.c + * Support routines for B-Tree skip scans. + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/adt/skipsupport.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/nbtree.h" +#include "utils/lsyscache.h" +#include "utils/skipsupport.h" + +/* + * Fill in SkipSupport given an operator class (opfamily + opcintype). + * + * On success, returns true, and initializes all SkipSupport fields for + * caller. Otherwise returns false, indicating that operator class has no + * skip support function. + */ +bool +PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, bool reverse, + SkipSupport sksup) +{ + Oid skipSupportFunction; + + /* Look for a skip support function */ + skipSupportFunction = get_opfamily_proc(opfamily, opcintype, opcintype, + BTSKIPSUPPORT_PROC); + if (!OidIsValid(skipSupportFunction)) + return false; + + OidFunctionCall1(skipSupportFunction, PointerGetDatum(sksup)); + + if (reverse) + { + Datum low_elem = sksup->low_elem; + + sksup->low_elem = sksup->high_elem; + sksup->high_elem = low_elem; + } + + return true; +} diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index 45eb1b2fe..a9222f896 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -13,12 +13,15 @@ #include "postgres.h" +#include + #include "common/hashfn.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "port/pg_bswap.h" #include "utils/fmgrprotos.h" #include "utils/guc.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #include "utils/timestamp.h" #include "utils/uuid.h" @@ -390,6 +393,68 @@ uuid_abbrev_convert(Datum original, SortSupport ssup) return res; } +static Datum +uuid_decrement(Relation rel, Datum existing) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] > 0) + { + uuid->data[i]--; + return UUIDPGetDatum(uuid); + } + uuid->data[i] = UCHAR_MAX; + } + + Assert(false); + + return UUIDPGetDatum(uuid); +} + +static Datum +uuid_increment(Relation rel, Datum existing) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] < UCHAR_MAX) + { + uuid->data[i]++; + return UUIDPGetDatum(uuid); + } + uuid->data[i] = 0; + } + + Assert(false); + + return UUIDPGetDatum(uuid); +} + +Datum +uuid_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + pg_uuid_t *uuid_min = palloc(UUID_LEN); + pg_uuid_t *uuid_max = palloc(UUID_LEN); + + memset(uuid_min->data, 0x00, UUID_LEN); + memset(uuid_max->data, 0xFF, UUID_LEN); + + sksup->decrement = uuid_decrement; + sksup->increment = uuid_increment; + sksup->low_elem = UUIDPGetDatum(uuid_min); + sksup->high_elem = UUIDPGetDatum(uuid_max); + + PG_RETURN_VOID(); +} + /* hash index support */ Datum uuid_hash(PG_FUNCTION_ARGS) diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 6f4188599..8ec3f150a 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -28,6 +28,7 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/nbtree.h" #include "access/slru.h" #include "access/toast_compression.h" #include "access/twophase.h" @@ -3523,6 +3524,17 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + /* XXX Remove before commit */ + { + {"skipscan_prefix_cols", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_prefix_cols, + INDEX_MAX_KEYS, 0, INDEX_MAX_KEYS, + NULL, NULL, NULL + }, + { /* Can't be set in postgresql.conf */ {"server_version_num", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml index 2b3997988..9662fb2ba 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -583,6 +583,19 @@ options(relopts local_relopts *) returns + + skipsupport + + + Optionally, a btree operator family may provide a skip + support function, registered under support function + number 6. These functions allow the B-tree code to more efficiently + navigate the index structure via an index skip scan. The + APIs involved in this are defined in + src/include/utils/skipsupport.h. + + + diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 22d8ad1aa..f17dd3456 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -461,6 +461,13 @@ 5 + + + Return the addresses of C-callable skip support function(s) + (optional) + + 6 + @@ -1056,7 +1063,8 @@ DEFAULT FOR TYPE int8 USING btree FAMILY integer_ops AS FUNCTION 1 btint8cmp(int8, int8) , FUNCTION 2 btint8sortsupport(internal) , FUNCTION 3 in_range(int8, int8, int8, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint8skipsupport(internal); CREATE OPERATOR CLASS int4_ops DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS @@ -1069,7 +1077,8 @@ DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS FUNCTION 1 btint4cmp(int4, int4) , FUNCTION 2 btint4sortsupport(internal) , FUNCTION 3 in_range(int4, int4, int4, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint4skipsupport(internal); CREATE OPERATOR CLASS int2_ops DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS @@ -1082,7 +1091,8 @@ DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS FUNCTION 1 btint2cmp(int2, int2) , FUNCTION 2 btint2sortsupport(internal) , FUNCTION 3 in_range(int2, int2, int2, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint2skipsupport(internal); ALTER OPERATOR FAMILY integer_ops USING btree ADD -- cross-type comparisons int8 vs int2 diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index ae54cb254..8b6b775c1 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -362,9 +362,9 @@ ERROR: invalid operator number 0, must be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ERROR: operator argument types must be specified in ALTER OPERATOR FAMILY ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ERROR: invalid function number 0, must be between 1 and 5 -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 -ERROR: invalid function number 6, must be between 1 and 5 +ERROR: invalid function number 0, must be between 1 and 6 +ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 +ERROR: invalid function number 7, must be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index 3bbe4c5f9..a8d5be6c1 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -5138,9 +5138,10 @@ List of access methods btree | uuid_ops | uuid | uuid | 1 | uuid_cmp btree | uuid_ops | uuid | uuid | 2 | uuid_sortsupport btree | uuid_ops | uuid | uuid | 4 | btequalimage + btree | uuid_ops | uuid | uuid | 6 | uuid_skipsupport hash | uuid_ops | uuid | uuid | 1 | uuid_hash hash | uuid_ops | uuid | uuid | 2 | uuid_hash_extended -(5 rows) +(6 rows) -- check \dconfig set work_mem = 10240; diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql index de58d268d..4246afefd 100644 --- a/src/test/regress/sql/alter_generic.sql +++ b/src/test/regress/sql/alter_generic.sql @@ -310,7 +310,7 @@ ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 6 < (int4, int2); -- ope ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 0 < (int4, int2); -- operator number should be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 +ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index e6c1caf64..369eca3a4 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -218,6 +218,7 @@ BTScanPos BTScanPosData BTScanPosItem BTShared +BTSkipPreproc BTSortArrayContext BTSpool BTStack @@ -2650,6 +2651,8 @@ SingleBoundSortItem SinglePartitionSpec Size SkipPages +SkipSupport +SkipSupportData SlabBlock SlabContext SlabSlot -- 2.45.2