From 35672d7b6a8fa7e78341d7f6580474693a6afd7d Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 16 Apr 2024 13:21:36 -0400 Subject: [PATCH v3] Add skip scan to nbtree. Skip scan allows nbtree index scans to efficiently use a composite index on (a, b) for queries with a predicate such as "WHERE b = 5". This is useful in cases where the total number of distinct values in the column 'a' is reasonably small (think hundreds, possibly thousands). In effect, a skip scan treats the composite index on (a, b) as if it were a series of disjunct subindexes -- one subindex per distinct 'a' value. We exhaustively "search every subindex" using a qual that behaves like "WHERE a = ANY(<every possible 'a' value>) AND b = 5". The design of skip scan works by extending the design for arrays established by commit 5bf748b8. "Skip arrays" generate their array values procedurally and on-demand, but otherwise work just like arrays used by SAOPs. B-Tree operator classes on discrete types can now optionally provide a skip support routine. This is used to generate the next array element value by incrementing the current value (or by decrementing, in the case of backwards scans). When the opclass lacks a skip support routine, we use sentinel next-key values instead. Adding skip support makes skip scans more efficient in cases where there is naturally a good chance that the very next value will find matching tuples. For example, during an index scan with a leading "sales_date" attribute, there is a decent chance that a scan that just finished returning tuples matching "sales_date = '2024-06-01' and id = 5000" will find later tuples matching "sales_date = '2024-06-02' and id = 5000". It is to our advantage to skip straight to the relevant "id = 5000" leaf page, totally avoiding reading earlier "sales_date = '2024-06-02'" leaf pages. 
Author: Peter Geoghegan Reviewed-By: Masahiro Ikeda Discussion: https://postgr.es/m/CAH2-Wzmn1YsLzOGgjAQZdn1STSG_y8qP__vggTaPAYXJP+G4bw@mail.gmail.com --- src/include/access/nbtree.h | 24 +- src/include/catalog/pg_amproc.dat | 16 + src/include/catalog/pg_proc.dat | 24 + src/include/utils/skipsupport.h | 124 ++ src/backend/access/nbtree/nbtcompare.c | 201 +++ src/backend/access/nbtree/nbtree.c | 10 +- src/backend/access/nbtree/nbtsearch.c | 130 +- src/backend/access/nbtree/nbtutils.c | 1666 +++++++++++++++++-- src/backend/access/nbtree/nbtvalidate.c | 4 + src/backend/commands/opclasscmds.c | 25 + src/backend/utils/adt/Makefile | 1 + src/backend/utils/adt/date.c | 34 + src/backend/utils/adt/meson.build | 1 + src/backend/utils/adt/selfuncs.c | 30 +- src/backend/utils/adt/skipsupport.c | 54 + src/backend/utils/adt/uuid.c | 65 + src/backend/utils/misc/guc_tables.c | 23 + doc/src/sgml/btree.sgml | 13 + doc/src/sgml/xindex.sgml | 16 +- src/test/regress/expected/alter_generic.out | 6 +- src/test/regress/expected/psql.out | 3 +- src/test/regress/sql/alter_generic.sql | 2 +- src/tools/pgindent/typedefs.list | 3 + 23 files changed, 2293 insertions(+), 182 deletions(-) create mode 100644 src/include/utils/skipsupport.h create mode 100644 src/backend/utils/adt/skipsupport.c diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 749304334..7cd5902cf 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -24,6 +24,7 @@ #include "lib/stringinfo.h" #include "storage/bufmgr.h" #include "storage/shm_toc.h" +#include "utils/skipsupport.h" /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -709,7 +710,8 @@ BTreeTupleGetMaxHeapTID(IndexTuple itup) #define BTINRANGE_PROC 3 #define BTEQUALIMAGE_PROC 4 #define BTOPTIONS_PROC 5 -#define BTNProcs 5 +#define BTSKIPSUPPORT_PROC 6 +#define BTNProcs 6 /* * We need to be able to tell the difference between read and write @@ -1032,9 +1034,18 @@ typedef 
BTScanPosData *BTScanPos; typedef struct BTArrayKeyInfo { int scan_key; /* index of associated key in keyData */ + int num_elems; /* number of elems (-1 for skip array) */ + + /* State used by standard arrays that store elements in memory */ int cur_elem; /* index of current element in elem_values */ - int num_elems; /* number of elems in current array value */ Datum *elem_values; /* array of num_elems Datums */ + + /* State used by skip arrays, which generate elements procedurally */ + bool use_sksup; /* sksup set to valid routine? */ + bool null_elem; /* lowest/highest element actually NULL? */ + SkipSupportData sksup; /* opclass skip scan support, when use_sksup */ + ScanKey low_compare; /* !use_sksup > and >= lower bound */ + ScanKey high_compare; /* !use_sksup < and <= upper bound */ } BTArrayKeyInfo; typedef struct BTScanOpaqueData @@ -1123,6 +1134,11 @@ typedef struct BTReadPageState */ #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_SKIP 0x00040000 /* SK_SEARCHARRAY skip scan key */ +#define SK_BT_NEG_INF 0x00080000 /* -inf search SK_SEARCHARRAY */ +#define SK_BT_POS_INF 0x00100000 /* +inf search SK_SEARCHARRAY */ +#define SK_BT_NEXTKEY 0x00200000 /* key after sk_argument */ +#define SK_BT_PREVKEY 0x00400000 /* key before sk_argument */ #define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) @@ -1159,6 +1175,10 @@ typedef struct BTOptions #define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4 #define PROGRESS_BTREE_PHASE_LEAF_LOAD 5 +/* GUC parameters (just a temporary convenience for reviewers) */ +extern PGDLLIMPORT int skipscan_prefix_cols; +extern PGDLLIMPORT bool skipscan_skipsupport_enabled; + /* * external entry points for btree, in nbtree.c */ diff --git a/src/include/catalog/pg_amproc.dat 
b/src/include/catalog/pg_amproc.dat index f639c3a6a..2a8f6f3f1 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -21,6 +21,8 @@ amprocrighttype => 'bit', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '1', amproc => 'btboolcmp' }, +{ amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', + amprocrighttype => 'bool', amprocnum => '6', amproc => 'btboolskipsupport' }, { amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bpchar_ops', amproclefttype => 'bpchar', @@ -41,12 +43,16 @@ amprocrighttype => 'char', amprocnum => '1', amproc => 'btcharcmp' }, { amprocfamily => 'btree/char_ops', amproclefttype => 'char', amprocrighttype => 'char', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/char_ops', amproclefttype => 'char', + amprocrighttype => 'char', amprocnum => '6', amproc => 'btcharskipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '1', amproc => 'date_cmp' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '2', amproc => 'date_sortsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', + amprocrighttype => 'date', amprocnum => '6', amproc => 'date_skipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'timestamp', amprocnum => '1', amproc => 'date_cmp_timestamp' }, @@ -122,6 +128,8 @@ amprocrighttype => 'int2', amprocnum => '2', amproc => 'btint2sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => 
'4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', + amprocrighttype => 'int2', amprocnum => '6', amproc => 'btint2skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint24cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', @@ -141,6 +149,8 @@ amprocrighttype => 'int4', amprocnum => '2', amproc => 'btint4sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int4', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', + amprocrighttype => 'int4', amprocnum => '6', amproc => 'btint4skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int8', amprocnum => '1', amproc => 'btint48cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', @@ -160,6 +170,8 @@ amprocrighttype => 'int8', amprocnum => '2', amproc => 'btint8sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int8', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', + amprocrighttype => 'int8', amprocnum => '6', amproc => 'btint8skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint84cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', @@ -193,6 +205,8 @@ amprocrighttype => 'oid', amprocnum => '2', amproc => 'btoidsortsupport' }, { amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', amprocrighttype => 'oid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', + amprocrighttype => 'oid', amprocnum => '6', amproc => 'btoidskipsupport' }, { amprocfamily => 'btree/oidvector_ops', amproclefttype => 'oidvector', 
amprocrighttype => 'oidvector', amprocnum => '1', amproc => 'btoidvectorcmp' }, @@ -261,6 +275,8 @@ amprocrighttype => 'uuid', amprocnum => '2', amproc => 'uuid_sortsupport' }, { amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', amprocrighttype => 'uuid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', + amprocrighttype => 'uuid', amprocnum => '6', amproc => 'uuid_skipsupport' }, { amprocfamily => 'btree/record_ops', amproclefttype => 'record', amprocrighttype => 'record', amprocnum => '1', amproc => 'btrecordcmp' }, { amprocfamily => 'btree/record_image_ops', amproclefttype => 'record', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 73d9cf858..27921e0df 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1004,18 +1004,27 @@ { oid => '3129', descr => 'sort support', proname => 'btint2sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint2sortsupport' }, +{ oid => '9290', descr => 'skip support', + proname => 'btint2skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint2skipsupport' }, { oid => '351', descr => 'less-equal-greater', proname => 'btint4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int4 int4', prosrc => 'btint4cmp' }, { oid => '3130', descr => 'sort support', proname => 'btint4sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint4sortsupport' }, +{ oid => '9291', descr => 'skip support', + proname => 'btint4skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint4skipsupport' }, { oid => '842', descr => 'less-equal-greater', proname => 'btint8cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int8 int8', prosrc => 'btint8cmp' }, { oid => '3131', descr => 'sort support', proname => 'btint8sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint8sortsupport' 
}, +{ oid => '9292', descr => 'skip support', + proname => 'btint8skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint8skipsupport' }, { oid => '354', descr => 'less-equal-greater', proname => 'btfloat4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'float4 float4', prosrc => 'btfloat4cmp' }, @@ -1034,12 +1043,18 @@ { oid => '3134', descr => 'sort support', proname => 'btoidsortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btoidsortsupport' }, +{ oid => '9293', descr => 'skip support', + proname => 'btoidskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btoidskipsupport' }, { oid => '404', descr => 'less-equal-greater', proname => 'btoidvectorcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'oidvector oidvector', prosrc => 'btoidvectorcmp' }, { oid => '358', descr => 'less-equal-greater', proname => 'btcharcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'char char', prosrc => 'btcharcmp' }, +{ oid => '9294', descr => 'skip support', + proname => 'btcharskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btcharskipsupport' }, { oid => '359', descr => 'less-equal-greater', proname => 'btnamecmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'name name', prosrc => 'btnamecmp' }, @@ -2214,6 +2229,9 @@ { oid => '3136', descr => 'sort support', proname => 'date_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'date_sortsupport' }, +{ oid => '9295', descr => 'skip support', + proname => 'date_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'date_skipsupport' }, { oid => '4133', descr => 'window RANGE support', proname => 'in_range', prorettype => 'bool', proargtypes => 'date date interval bool bool', @@ -4368,6 +4386,9 @@ { oid => '1693', descr => 'less-equal-greater', proname => 'btboolcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'bool bool', 
prosrc => 'btboolcmp' }, +{ oid => '9296', descr => 'skip support', + proname => 'btboolskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btboolskipsupport' }, { oid => '1688', descr => 'hash', proname => 'time_hash', prorettype => 'int4', proargtypes => 'time', @@ -9192,6 +9213,9 @@ { oid => '3300', descr => 'sort support', proname => 'uuid_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'uuid_sortsupport' }, +{ oid => '9297', descr => 'skip support', + proname => 'uuid_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'uuid_skipsupport' }, { oid => '2961', descr => 'I/O', proname => 'uuid_recv', prorettype => 'uuid', proargtypes => 'internal', prosrc => 'uuid_recv' }, diff --git a/src/include/utils/skipsupport.h b/src/include/utils/skipsupport.h new file mode 100644 index 000000000..ab79acb8c --- /dev/null +++ b/src/include/utils/skipsupport.h @@ -0,0 +1,124 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.h + * Support routines for B-Tree skip scans. + * + * B-Tree operator classes for discrete types can optionally provide a support + * function for skipping. This is used during skip scans. + * + * A B-tree operator class that implements skip support provides B-tree index + * scans with a way of enumerating and iterating through every possible value + * from the domain of indexable values. This gives scans a way to determine + * the next value in line for a given skip array/scan key/skipped attribute. + * This happens at the point where the scan determines that another primitive + * index scan is required. The next value is used (in combination with at + * least one additional lower-order non-skip key, taken from the SQL query) to + * relocate the scan, skipping over many irrelevant leaf pages in the process. 
+ * + * There are many data types/opclasses where implementing a skip support + * scheme is inherently impossible (or at least impractical). Obviously, it + * would be wrong if the "next" value generated by an opclass was actually + * after the true next value (any index tuples with the true next value would + * be overlooked by the index scan). + * + * Skip scan generally works best with discrete types such as integer, date, + * and boolean: types where there is a decent chance that indexes will contain + * contiguous values (in respect of the leading/skipped index attribute). + * When gaps/discontinuities are naturally rare (e.g., a leading identity + * column in a composite index, a date column preceding a product_id column), + * then it makes sense for the skip scan to optimistically assume that the + * next distinct indexable value will find directly matching index tuples. + * The B-Tree code can fall back on next-key probes for any opclass that + * doesn't include a skip support function, but it's a good idea to provide + * skip support for types that are likely to see benefits. + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/skipsupport.h + * + *------------------------------------------------------------------------- + */ +#ifndef SKIPSUPPORT_H +#define SKIPSUPPORT_H + +#include "utils/relcache.h" + +typedef struct SkipSupportData *SkipSupport; + +/* + * State/callbacks used by skip arrays to procedurally generate elements. + * + * A BTSKIPSUPPORT_PROC function must set each and every field when called. + * If an opclass can only set some of the fields, then it cannot safely + * provide a skip support routine (and so must rely on the fallback strategy + * used by continuous types, such as numeric). 
+ */ +typedef struct SkipSupportData +{ + /* + * low_elem and high_elem must be set with the lowest and highest possible + * values from the domain of indexable values (assuming standard ascending + * order). This helps the B-Tree code with finding its initial position + * at the leaf level (during the skip scan's first primitive index scan). + * In other words, it gives the B-Tree code a useful value to start from, + * before any data has been read from the index. + * + * low_elem and high_elem can also be used to prove that a qual is + * unsatisfiable in certain cross-type scenarios. + * + * low_elem and high_elem are also used by skip scans to determine when + * they've reached the final possible value (in the current direction). + * It's typical for the scan to run out of leaf pages before it runs out + * of unscanned indexable values, but it's still useful for the scan to + * have a way to recognize when it has reached the last possible value + * (this saves us a useless probe that just lands on the final leaf page). + * + * Note: the logic for determining that the scan has reached the final + * possible value naturally belongs in the B-Tree code. The final value + * isn't necessarily the original high_elem/low_elem set by the opclass. + * In particular, it'll be a lower/higher value when B-Tree preprocessing + * determines that the true range of possible values should be restricted, + * due to the presence of an inequality applied to the index's skipped + * attribute. These are range skip scans. + */ + Datum low_elem; /* lowest sorting/leftmost non-NULL value */ + Datum high_elem; /* highest sorting/rightmost non-NULL value */ + + /* + * Decrement/increment functions. + * + * Returns a decremented/incremented copy of caller's existing datum, + * allocated in caller's memory context (in the case of pass-by-reference + * types). It's not okay for these functions to leak any memory. 
+ * + * Both decrement and increment callbacks are guaranteed to never be + * called with a NULL "existing" arg. (In general it is the B-Tree code's + * job to worry about NULLs, and about whether indexed values are stored + * in ASC order or DESC order.) + * + * The decrement callback is guaranteed to only be called with an + * "existing" value that's strictly > the low_elem set by the opclass. + * Similarly, the increment callback is guaranteed to only be called with + * an "existing" value that's strictly < the high_elem set by the opclass. + * Consequently, opclasses don't have to deal with "overflow" themselves + * (though asserting that the B-Tree code got it right is a good idea). + * + * It's quite possible (and very common) for the B-Tree skip scan caller's + * "existing" datum to just be a straight copy of a value that it copied + * from the index. Operator classes must be liberal in accepting every + * possible representational variation within the underlying data type. + * Opclasses don't have to preserve whatever semantically insignificant + * information the data type might be carrying around, though. + * + * Note: < and > are defined by the opclass's ORDER proc in the usual way. 
+ */ + Datum (*decrement) (Relation rel, Datum existing); + Datum (*increment) (Relation rel, Datum existing); +} SkipSupportData; + +extern bool PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, + bool reverse, SkipSupport sksup); + +#endif /* SKIPSUPPORT_H */ diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 1c72867c8..48a877613 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -58,6 +58,7 @@ #include #include "utils/fmgrprotos.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #ifdef STRESS_SORT_INT_MIN @@ -78,6 +79,39 @@ btboolcmp(PG_FUNCTION_ARGS) PG_RETURN_INT32((int32) a - (int32) b); } +static Datum +bool_decrement(Relation rel, Datum existing) +{ + bool bexisting = DatumGetBool(existing); + + Assert(bexisting == true); + + return BoolGetDatum(bexisting - 1); +} + +static Datum +bool_increment(Relation rel, Datum existing) +{ + bool bexisting = DatumGetBool(existing); + + Assert(bexisting == false); + + return BoolGetDatum(bexisting + 1); +} + +Datum +btboolskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = bool_decrement; + sksup->increment = bool_increment; + sksup->low_elem = BoolGetDatum(false); + sksup->high_elem = BoolGetDatum(true); + + PG_RETURN_VOID(); +} + Datum btint2cmp(PG_FUNCTION_ARGS) { @@ -105,6 +139,39 @@ btint2sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int2_decrement(Relation rel, Datum existing) +{ + int16 iexisting = DatumGetInt16(existing); + + Assert(iexisting > PG_INT16_MIN); + + return Int16GetDatum(iexisting - 1); +} + +static Datum +int2_increment(Relation rel, Datum existing) +{ + int16 iexisting = DatumGetInt16(existing); + + Assert(iexisting < PG_INT16_MAX); + + return Int16GetDatum(iexisting + 1); +} + +Datum +btint2skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + 
sksup->decrement = int2_decrement; + sksup->increment = int2_increment; + sksup->low_elem = Int16GetDatum(PG_INT16_MIN); + sksup->high_elem = Int16GetDatum(PG_INT16_MAX); + + PG_RETURN_VOID(); +} + Datum btint4cmp(PG_FUNCTION_ARGS) { @@ -128,6 +195,39 @@ btint4sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int4_decrement(Relation rel, Datum existing) +{ + int32 iexisting = DatumGetInt32(existing); + + Assert(iexisting > PG_INT32_MIN); + + return Int32GetDatum(iexisting - 1); +} + +static Datum +int4_increment(Relation rel, Datum existing) +{ + int32 iexisting = DatumGetInt32(existing); + + Assert(iexisting < PG_INT32_MAX); + + return Int32GetDatum(iexisting + 1); +} + +Datum +btint4skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int4_decrement; + sksup->increment = int4_increment; + sksup->low_elem = Int32GetDatum(PG_INT32_MIN); + sksup->high_elem = Int32GetDatum(PG_INT32_MAX); + + PG_RETURN_VOID(); +} + Datum btint8cmp(PG_FUNCTION_ARGS) { @@ -171,6 +271,39 @@ btint8sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int8_decrement(Relation rel, Datum existing) +{ + int64 iexisting = DatumGetInt64(existing); + + Assert(iexisting > PG_INT64_MIN); + + return Int64GetDatum(iexisting - 1); +} + +static Datum +int8_increment(Relation rel, Datum existing) +{ + int64 iexisting = DatumGetInt64(existing); + + Assert(iexisting < PG_INT64_MAX); + + return Int64GetDatum(iexisting + 1); +} + +Datum +btint8skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int8_decrement; + sksup->increment = int8_increment; + sksup->low_elem = Int64GetDatum(PG_INT64_MIN); + sksup->high_elem = Int64GetDatum(PG_INT64_MAX); + + PG_RETURN_VOID(); +} + Datum btint48cmp(PG_FUNCTION_ARGS) { @@ -292,6 +425,39 @@ btoidsortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +oid_decrement(Relation rel, Datum existing) +{ + Oid oexisting = 
DatumGetObjectId(existing); + + Assert(oexisting > InvalidOid); + + return ObjectIdGetDatum(oexisting - 1); +} + +static Datum +oid_increment(Relation rel, Datum existing) +{ + Oid oexisting = DatumGetObjectId(existing); + + Assert(oexisting < OID_MAX); + + return ObjectIdGetDatum(oexisting + 1); +} + +Datum +btoidskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = oid_decrement; + sksup->increment = oid_increment; + sksup->low_elem = ObjectIdGetDatum(InvalidOid); + sksup->high_elem = ObjectIdGetDatum(OID_MAX); + + PG_RETURN_VOID(); +} + Datum btoidvectorcmp(PG_FUNCTION_ARGS) { @@ -325,3 +491,38 @@ btcharcmp(PG_FUNCTION_ARGS) /* Be careful to compare chars as unsigned */ PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b)); } + +static Datum +char_decrement(Relation rel, Datum existing) +{ + uint8 cexisting = DatumGetUInt8(existing); + + Assert(cexisting > 0); + + return CharGetDatum((uint8) cexisting - 1); +} + +static Datum +char_increment(Relation rel, Datum existing) +{ + uint8 cexisting = DatumGetUInt8(existing); + + Assert(cexisting < UCHAR_MAX); + + return CharGetDatum((uint8) cexisting + 1); +} + +Datum +btcharskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = char_decrement; + sksup->increment = char_increment; + + /* btcharcmp compares chars as unsigned */ + sksup->low_elem = UInt8GetDatum(0); + sksup->high_elem = UInt8GetDatum(UCHAR_MAX); + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 686a3206f..9c9cd48f7 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -324,11 +324,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); BTScanPosInvalidate(so->currPos); BTScanPosInvalidate(so->markPos); - if (scan->numberOfKeys > 0) - so->keyData = (ScanKey) 
palloc(scan->numberOfKeys * sizeof(ScanKeyData)); - else - so->keyData = NULL; + so->keyData = NULL; so->needPrimScan = false; so->scanBehind = false; so->arrayKeys = NULL; @@ -408,6 +405,11 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, scan->numberOfKeys * sizeof(ScanKeyData)); so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ so->numArrayKeys = 0; /* ditto */ + + /* Release private storage allocated in previous btrescan, if any */ + if (so->keyData != NULL) + pfree(so->keyData); + so->keyData = NULL; } /* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 57bcfc7e4..f1bb4e8ee 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -880,7 +880,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Buffer buf; BTStack stack; OffsetNumber offnum; - StrategyNumber strat; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; @@ -1022,6 +1021,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanKey chosen; ScanKey impliesNN; ScanKey cur; + int ikey = 0, + ichosen = 0; /* * chosen is the so-far-chosen key for the current attribute, if any. @@ -1042,6 +1043,96 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { if (i >= so->numberOfKeys || cur->sk_attno != curattr) { + /* + * Conceptually, skip arrays consist of array elements whose + * values are generated procedurally and on demand. We need + * special handling for that here. + * + * We must interpret various sentinel values to generate an + * insertion scan key. This is only actually needed for index + * attributes whose input opclass lacks a skip support routine + * (when skip support is available we'll always be able to + * generate true array element datum values instead). 
+ */ + if (chosen && chosen->sk_flags & (SK_BT_NEG_INF | SK_BT_POS_INF)) + { + BTArrayKeyInfo *array = NULL; + + Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(!(chosen->sk_flags & (SK_BT_NEXTKEY | SK_BT_PREVKEY))); + + for (; ikey < so->numArrayKeys; ikey++) + { + array = &so->arrayKeys[ikey]; + if (array->scan_key == ichosen) + break; + } + + Assert(array->scan_key == ichosen); + Assert(array->num_elems == -1); + Assert(!array->use_sksup); + + if (array->null_elem) + { + /* + * Treat the chosen scan key as having the value -inf + * (or the value +inf, in the backwards scan case) by + * not appending it to the local startKeys[] array. + * + * Note: we expect one or more lower-order required + * keys that won't influence initial positioning (for + * this primitive index scan). There cannot possibly + * be non-pivot tuples that have values matching -inf, + * though, so this "omission" can have no real impact. + * + * Note: This array has a NULL element, which means + * that there must be no upper/lower inequalities. + * Assert that preprocessing got this right. + */ + Assert(!array->low_compare); + Assert(!array->high_compare); + break; /* done adding entries to startKeys[] */ + } + else if ((chosen->sk_flags & SK_BT_NEG_INF) && + array->low_compare) + { + Assert(ScanDirectionIsForward(dir)); + + /* use array's inequality key in startKeys[] */ + chosen = array->low_compare; + } + else if ((chosen->sk_flags & SK_BT_POS_INF) && + array->high_compare) + { + Assert(ScanDirectionIsBackward(dir)); + + /* use array's inequality key in startKeys[] */ + chosen = array->high_compare; + } + else + { + /* + * Array starts at (or ends just before) any non-NULL + * values. Deduce a NOT NULL key to skip over NULLs. + * + * Note: range skip arrays generated using an explicit + * IS NOT NULL input scan key against an otherwise + * omitted prefix attribute use this path, too. 
+ */ + impliesNN = chosen; + chosen = NULL; + } + + /* + * We'll add the chosen inequality (or a deduced NOT NULL + * key) to startKeys[] below. + * + * Note: we usually won't be able to add any additional + * scan keys for index attributes beyond this one. This + * is okay for the same reason as the -inf/+inf case. + */ + } + /* * Done looking at keys for curattr. If we didn't find a * usable boundary key, see if we can deduce a NOT NULL key. @@ -1075,16 +1166,41 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) break; startKeys[keysz++] = chosen; + /* + * Skip arrays can also use a sk_argument which is marked + * "next key". This is another sentinel array element value + * requiring special handling here by us. As with -inf/+inf + * sentinels, there cannot be any exact non-pivot matches. + */ + if (chosen->sk_flags & (SK_BT_NEXTKEY | SK_BT_PREVKEY)) + { + Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(!(chosen->sk_flags & (SK_BT_NEG_INF | SK_BT_POS_INF))); + Assert(chosen->sk_strategy == BTEqualStrategyNumber); + + /* + * Adjust strat_total, so that our = key gets treated like + * a > key (or like a < key). + * + * The key is still conceptually a = key; we only do this + * because there's no explicit next/prev key we can use. + */ + if (chosen->sk_flags & SK_BT_NEXTKEY) + strat_total = BTGreaterStrategyNumber; + else + strat_total = BTLessStrategyNumber; + break; + } + /* * Adjust strat_total, and quit if we have stored a > or < * key. 
*/ - strat = chosen->sk_strategy; - if (strat != BTEqualStrategyNumber) + if (chosen->sk_strategy != BTEqualStrategyNumber) { - strat_total = strat; - if (strat == BTGreaterStrategyNumber || - strat == BTLessStrategyNumber) + strat_total = chosen->sk_strategy; + if (chosen->sk_strategy == BTGreaterStrategyNumber || + chosen->sk_strategy == BTLessStrategyNumber) break; } @@ -1103,6 +1219,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) curattr = cur->sk_attno; chosen = NULL; impliesNN = NULL; + ichosen = -1; } /* @@ -1127,6 +1244,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) case BTEqualStrategyNumber: /* override any non-equality choice */ chosen = cur; + ichosen = i; break; case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index d6de2072d..133cb4687 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -29,9 +29,50 @@ #include "utils/memutils.h" #include "utils/rel.h" +/* + * GUC parameters (temporary convenience for reviewers). + * + * To disable all skipping, set skipscan_prefix_cols=0. Otherwise set it to + * the attribute number that you wish to make the last attribute number that + * we can add a skip scan key for. + * + * For example, setting skipscan_prefix_cols=1 before an index scan with qual + * "WHERE b = 1 AND c > 42" will make us generate a skip scan key on the + * column 'a' (which is attnum 1) only, preventing us from adding one for the + * column 'c' (and so 'c' will still have an inequality scan key, required in + * only one direction -- 'c' won't be output as a "range" skip key/array). + * + * The same scan keys will be output when skipscan_prefix_cols=2, given the + * same query/qual, since we naturally get a required equality scan key on 'b' + * from the input scan keys (provided we at least manage to add a skip scan + * key on 'a' that "anchors its required-ness" to the 'b' scan key.) 
+ * + * When skipscan_prefix_cols is set to the number of key columns in the index, + * we're as aggressive as possible about adding skip scan arrays/scan keys. + * This is the current default behavior, and the behavior we're targeting for + * the committed patch (if there are slowdowns from being maximally aggressive + * here then the likely solution is to make _bt_advance_array_keys adaptive, + * rather than trying to predict what will work during preprocessing). + */ +int skipscan_prefix_cols = INDEX_MAX_KEYS; + +/* + * skipscan_skipsupport_enabled can be used to avoid using opclass skip + * support routines. This can be used to quantify the performance benefit that + * comes from having dedicated skip support, with a given test query. + */ +bool skipscan_skipsupport_enabled = true; + #define LOOK_AHEAD_REQUIRED_RECHECKS 3 #define LOOK_AHEAD_DEFAULT_DISTANCE 5 +typedef struct BTSkipPreproc +{ + SkipSupportData sksup; /* opclass skip scan support */ + bool use_sksup; /* sksup set to valid routine? 
*/ + Oid eq_op; /* InvalidOid means don't skip */ +} BTSkipPreproc; + typedef struct BTSortArrayContext { FmgrInfo *sortproc; @@ -62,17 +103,48 @@ static bool _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, FmgrInfo *orderproc, BTArrayKeyInfo *array, bool *qual_ok); -static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan); +static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); +static int _bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts); +static bool _bt_skip_support(Relation rel, int add_skip_attno, + BTSkipPreproc *skipatts); +static inline Datum _bt_apply_decrement(Relation rel, ScanKey skey, + BTArrayKeyInfo *array); +static inline Datum _bt_apply_increment(Relation rel, ScanKey skey, + BTArrayKeyInfo *array); static int _bt_compare_array_elements(const void *a, const void *b, void *arg); static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur); + Datum arrdatum, bool arrnull, + ScanKey cur); +static void _bt_apply_compare_array(ScanKey arraysk, ScanKey skey, + FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok); +static void _bt_apply_compare_skiparray(IndexScanDesc scan, ScanKey arraysk, + ScanKey skey, FmgrInfo *orderproc, + FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok); static int _bt_binsrch_array_skey(FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result); +static void _bt_binsrch_skiparray_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static void _bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_scankey_increment(Relation rel, ScanKey skey, 
BTArrayKeyInfo *array); +static void _bt_scankey_set_low_or_high(Relation rel, ScanKey skey, + BTArrayKeyInfo *array, bool low_not_high); +static bool _bt_scankey_skip_increment(Relation rel, ScanDirection dir, + BTArrayKeyInfo *array, ScanKey skey, + FmgrInfo *orderproc); +static void _bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + Datum tupdatum, bool tupnull); +static void _bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, @@ -251,9 +323,6 @@ _bt_freestack(BTStack stack) * It is convenient for _bt_preprocess_keys caller to have to deal with no * more than one equality strategy array scan key per index attribute. We'll * always be able to set things up that way when complete opfamilies are used. - * Eliminated array scan keys can be recognized as those that have had their - * sk_strategy field set to InvalidStrategy here by us. Caller should avoid - * including these in the scan's so->keyData[] output array. * * We set the scan key references from the scan's BTArrayKeyInfo info array to * offsets into the temp modified input array returned to caller. Scans that @@ -261,18 +330,36 @@ _bt_freestack(BTStack stack) * preprocessing steps are complete. This will convert the scan key offset * references into references to the scan's so->keyData[] output scan keys. * + * We're also responsible for generating skip arrays (and their associated + * scan keys) here. This enables skip scan. 
We do this for index attributes + * that initially lacked an equality condition within scan->keyData[], iff + * doing so allows a later scan key (that was passed to us in scan->keyData[]) + * to be marked required by later preprocessing on output. + * _bt_decide_skipatts decides which attributes receive skip arrays. + * + * Caller must pass *numberOfKeys to give us a way to change the number of + * input scan keys (our output is caller's input). The returned array can be + * smaller than scan->keyData[] when we eliminated a redundant array scan key + * (redundant with some other array scan key, for the same attribute). It can + * also be larger when we added a skip array/skip scan key. Caller uses this + * to allocate so->keyData[] for the current btrescan. + * * Note: the reason we need to return a temp scan key array, rather than just * scribbling on scan->keyData, is that callers are permitted to call btrescan * without supplying a new set of scankey data. */ static ScanKey -_bt_preprocess_array_keys(IndexScanDesc scan) +_bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; - int numberOfKeys = scan->numberOfKeys; + int numArrayKeyData = scan->numberOfKeys; int16 *indoption = rel->rd_indoption; - int numArrayKeys; + BTSkipPreproc skipatts[INDEX_MAX_KEYS]; + int numArrayKeys, + numSkipArrayKeys, + output_ikey = 0; + AttrNumber attno_skip = 1; int origarrayatt = InvalidAttrNumber, origarraykey = -1; Oid origelemtype = InvalidOid; @@ -280,11 +367,14 @@ _bt_preprocess_array_keys(IndexScanDesc scan) MemoryContext oldContext; ScanKey arrayKeyData; /* modified copy of scan->keyData */ - Assert(numberOfKeys); + Assert(scan->numberOfKeys); - /* Quick check to see if there are any array keys */ + /* + * Quick check to see if there are any array keys, or any missing keys we + * can generate a "skip scan" array key for ourselves + */ numArrayKeys = 0; - for (int i = 0; i < 
numberOfKeys; i++) + for (int i = 0; i < scan->numberOfKeys; i++) { cur = &scan->keyData[i]; if (cur->sk_flags & SK_SEARCHARRAY) @@ -300,6 +390,16 @@ _bt_preprocess_array_keys(IndexScanDesc scan) } } + /* Consider generating skip arrays, and associated equality scan keys */ + numSkipArrayKeys = _bt_decide_skipatts(scan, skipatts); + if (numSkipArrayKeys) + { + /* At least one skip array scan key must be added to arrayKeyData[] */ + numArrayKeys += numSkipArrayKeys; + /* output scan key buffer allocation needs space for skip scan keys */ + numArrayKeyData += numSkipArrayKeys; + } + /* Quit if nothing to do. */ if (numArrayKeys == 0) return NULL; @@ -317,19 +417,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan) oldContext = MemoryContextSwitchTo(so->arrayContext); - /* Create modifiable copy of scan->keyData in the workspace context */ - arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData)); - memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData)); + /* Create output scan keys in the workspace context */ + arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); /* Allocate space for ORDER procs used to help _bt_checkkeys */ - so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo)); + so->orderProcs = (FmgrInfo *) palloc(numArrayKeyData * sizeof(FmgrInfo)); - /* Now process each array key */ + /* + * Process each array key, and generate skip arrays as needed. Also copy + * every scan->keyData[] input scan key (whether it's an array or not) + * into the arrayKeyData array we'll return to our caller (barring any + * array scan keys that we could eliminate early through array merging). 
+ */ numArrayKeys = 0; - for (int i = 0; i < numberOfKeys; i++) + for (int input_ikey = 0; input_ikey < scan->numberOfKeys; input_ikey++) { FmgrInfo sortproc; FmgrInfo *sortprocp = &sortproc; @@ -345,14 +449,88 @@ _bt_preprocess_array_keys(IndexScanDesc scan) int num_nonnulls; int j; - cur = &arrayKeyData[i]; - if (!(cur->sk_flags & SK_SEARCHARRAY)) - continue; + /* Create a skip array and scan key where indicated by skipatts */ + while (numSkipArrayKeys && + attno_skip <= scan->keyData[input_ikey].sk_attno) + { + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + Oid collation = rel->rd_indcollation[attno_skip - 1]; + Oid eq_op = skipatts[attno_skip - 1].eq_op; + RegProcedure cmp_proc; + + if (!OidIsValid(eq_op)) + { + /* won't skip using this attribute */ + attno_skip++; + continue; + } + + cmp_proc = get_opcode(eq_op); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing oprcode for skipping equals operator %u", eq_op); + + cur = &arrayKeyData[output_ikey]; + Assert(attno_skip <= scan->keyData[input_ikey].sk_attno); + ScanKeyEntryInitialize(cur, + SK_SEARCHARRAY | SK_BT_SKIP, /* flags */ + attno_skip, /* skipped att number */ + BTEqualStrategyNumber, /* equality strategy */ + InvalidOid, /* opclass input subtype */ + collation, /* index column's collation */ + cmp_proc, /* equality operator's proc */ + (Datum) 0); /* constant */ + + /* Initialize array fields */ + so->arrayKeys[numArrayKeys].scan_key = output_ikey; + so->arrayKeys[numArrayKeys].num_elems = -1; + so->arrayKeys[numArrayKeys].cur_elem = 0; + so->arrayKeys[numArrayKeys].elem_values = NULL; /* unused */ + so->arrayKeys[numArrayKeys].use_sksup = skipatts[attno_skip - 1].use_sksup; + so->arrayKeys[numArrayKeys].null_elem = true; /* for now */ + so->arrayKeys[numArrayKeys].sksup = skipatts[attno_skip - 1].sksup; + so->arrayKeys[numArrayKeys].low_compare = NULL; /* for now */ + so->arrayKeys[numArrayKeys].high_compare = NULL; /* for now */ + + /* + * Temporary testing GUC can disable the use 
of an opclass's skip + * support routine + */ + if (!skipscan_skipsupport_enabled) + so->arrayKeys[numArrayKeys].use_sksup = false; + + /* + * We'll need a 3-way ORDER proc to determine when and how the + * consed-up "array" will advance inside _bt_advance_array_keys. + * Set one up now. + */ + _bt_setup_array_cmp(scan, cur, opcintype, + &so->orderProcs[output_ikey], NULL); + + /* + * Prepare to output next scan key (might be another skip scan + * key, or it could be an input scan key from scan->keyData[]) + */ + numSkipArrayKeys--; + numArrayKeys++; + attno_skip++; + output_ikey++; /* keep this scan key/array */ + } /* - * First, deconstruct the array into elements. Anything allocated - * here (including a possibly detoasted array value) is in the - * workspace context. + * Copy input scan key into temp arrayKeyData scan key array. (From + * here on, cur points at our copy of the input scan key.) + */ + cur = &arrayKeyData[output_ikey]; + *cur = scan->keyData[input_ikey]; + + if (!(cur->sk_flags & SK_SEARCHARRAY)) + { + output_ikey++; /* keep this non-array scan key */ + continue; + } + + /* + * Deconstruct the array into elements */ arrayval = DatumGetArrayTypeP(cur->sk_argument); /* We could cache this data, but not clear it's worth it */ @@ -406,6 +584,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) _bt_find_extreme_element(scan, cur, elemtype, BTGreaterStrategyNumber, elem_values, num_nonnulls); + output_ikey++; /* keep this transformed scan key */ continue; case BTEqualStrategyNumber: /* proceed with rest of loop */ @@ -416,6 +595,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) _bt_find_extreme_element(scan, cur, elemtype, BTLessStrategyNumber, elem_values, num_nonnulls); + output_ikey++; /* keep this transformed scan key */ continue; default: elog(ERROR, "unrecognized StrategyNumber: %d", @@ -432,7 +612,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) * sortproc just points to the same proc used during binary searches. 
*/ _bt_setup_array_cmp(scan, cur, elemtype, - &so->orderProcs[i], &sortprocp); + &so->orderProcs[output_ikey], &sortprocp); /* * Sort the non-null elements and eliminate any duplicates. We must @@ -476,11 +656,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) break; } - /* - * Indicate to _bt_preprocess_keys caller that it must ignore - * this scan key - */ - cur->sk_strategy = InvalidStrategy; + /* Throw away this array */ continue; } @@ -511,12 +687,19 @@ _bt_preprocess_array_keys(IndexScanDesc scan) * Note: _bt_preprocess_array_keys_final will fix-up each array's * scan_key field later on, after so->keyData[] has been finalized. */ - so->arrayKeys[numArrayKeys].scan_key = i; + so->arrayKeys[numArrayKeys].scan_key = output_ikey; so->arrayKeys[numArrayKeys].num_elems = num_elems; so->arrayKeys[numArrayKeys].elem_values = elem_values; + so->arrayKeys[numArrayKeys].null_elem = false; /* unused */ + so->arrayKeys[numArrayKeys].use_sksup = false; /* redundant */ + so->arrayKeys[numArrayKeys].low_compare = NULL; /* unused */ + so->arrayKeys[numArrayKeys].high_compare = NULL; /* unused */ numArrayKeys++; + output_ikey++; /* keep this scan key/array */ } + /* Set final number of arrayKeyData[] keys, array keys */ + *numberOfKeys = output_ikey; so->numArrayKeys = numArrayKeys; MemoryContextSwitchTo(oldContext); @@ -624,7 +807,8 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) { BTArrayKeyInfo *array = &so->arrayKeys[arrayidx]; - Assert(array->num_elems > 0); + Assert(array->num_elems > 0 || array->num_elems == -1); + Assert(array->num_elems != -1 || outkey->sk_flags & SK_BT_REQFWD); if (array->scan_key == input_ikey) { @@ -685,6 +869,241 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) so->numArrayKeys, INDEX_MAX_KEYS))); } +/* + * _bt_decide_skipatts() -- set index attributes requiring skip arrays + * + * _bt_preprocess_array_keys helper function. Determines which attributes + * will require skip arrays/scan keys. 
Also sets up skip support function for + * each of these attributes. + * + * This sets up "skip scan". Adding skip arrays (and associated scan keys) + * allows _bt_preprocess_keys to mark lower-order scan keys (copied from the + * original scan->keyData[] array in the conventional way) as required. The + * overall effect is to enable skipping over irrelevant sections of the index. + * + * Return value is the total number of scan keys to add as "input" scan keys + * for further processing within _bt_preprocess_keys. + */ +static int +_bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts) +{ + Relation rel = scan->indexRelation; + ScanKey inputsk; + AttrNumber attno_inputsk = 1, + attno_skip = 1; + bool attno_has_equal = false, + attno_has_rowcompare = false; + int numSkipArrayKeys = 0, + prev_numSkipArrayKeys = 0; + + Assert(scan->numberOfKeys); + + /* + * FIXME Also don't support parallel scans for now. Must add logic to + * places like _bt_parallel_primscan_schedule so that we account for skip + * arrays when parallel workers serialize their array scan state. + */ + if (scan->parallel_scan) + return 0; + + inputsk = &scan->keyData[0]; + for (int i = 0;; inputsk++, i++) + { + /* + * Backfill skip arrays for any wholly omitted attributes prior to + * attno_inputsk + */ + while (attno_skip < attno_inputsk) + { + if (!_bt_skip_support(rel, attno_skip, &skipatts[attno_skip - 1])) + { + /* + * Opclass lacks a suitable skip support routine. + * + * Return prev_numSkipArrayKeys, so as to avoid including any + * "backfilled" arrays that were supposed to form a contiguous + * group with a skip array on this attribute. There is no + * benefit to adding backfill skip arrays unless we can do so + * for all attributes (all attributes up to and including the + * one immediately before attno_inputsk). 
+ */ + return prev_numSkipArrayKeys; + } + + /* plan on adding a backfill skip array for this attribute */ + numSkipArrayKeys++; + attno_skip++; + } + + /* + * Stop once past the final input scan key. We deliberately never add + * a skip attribute for the attribute of the last input scan key. + * + * If the last input scan key(s) use equality strategy, then a skip + * attribute is superfluous at best. If the last input scan key uses + * an inequality strategy, then adding a skip scan array/scan key is a + * valid though suboptimal transformation. It is better to arrange + * for preprocessing to allow such an input inequality scan key to + * remain an inequality on output. That way _bt_checkkeys will be + * able to make best use of both of its precheck optimizations, but + * _bt_first will be no less capable of efficiently finding the + * starting position for each primitive index scan. + */ + if (i >= scan->numberOfKeys) + break; + + /* + * Cannot keep adding skip arrays after a RowCompare + */ + if (attno_has_rowcompare) + break; + + /* + * Apply temporary testing GUC that can be used to disable skipping + * (either in part or in whole) + */ + if (attno_inputsk > skipscan_prefix_cols) + break; + + /* + * Now consider next attno_inputsk (or keep going if this is an + * additional scan key against the same attribute) + */ + if (attno_inputsk < inputsk->sk_attno) + { + prev_numSkipArrayKeys = numSkipArrayKeys; + + /* + * Now add skip array for previous scan key's attribute, though + * only if the attribute has no equality strategy scan keys. + * + * Adding skip arrays to an attribute that has one or more + * inequality scan keys will cause preprocessing to output a range + * skip array. This will happen when preprocessing proper deals + * with the redundancy between the array and its inequalities. 
+ */ + skipatts[attno_skip - 1].eq_op = InvalidOid; + if (!attno_has_equal) + { + /* Only saw inequalities for the prior attribute */ + if (_bt_skip_support(rel, attno_skip, + &skipatts[attno_skip - 1])) + { + /* add a range skip array for this attribute */ + numSkipArrayKeys++; + } + else + break; + } + else + { + /* + * Saw an equality for the prior attribute, so it doesn't need + * a skip array (not even a range skip array). We'll be able + * to add later skip arrays, too (doesn't matter if the prior + * attribute uses an input opclass without skip support). + */ + } + + /* Set things up for this new attribute */ + attno_skip++; + attno_inputsk = inputsk->sk_attno; + attno_has_equal = false; + } + + /* + * Track if this scan key's attribute has any equality strategy scan + * keys. + * + * Treat IS NULL scan keys as using equal strategy (they'll be marked + * as using it later on, by _bt_fix_scankey_strategy). + */ + if (inputsk->sk_strategy == BTEqualStrategyNumber || + (inputsk->sk_flags & SK_SEARCHNULL)) + attno_has_equal = true; + + /* + * We don't support RowCompare transformation. Remember that we saw a + * RowCompare, so that we don't keep adding skip attributes. + * + * We do still backfill skip attributes before the RowCompare, so that + * it can be marked required. This is similar to what happens when a + * conventional inequality uses an opclass that lacks skip support. + */ + if (inputsk->sk_flags & SK_ROW_HEADER) + attno_has_rowcompare = true; + } + + return numSkipArrayKeys; +} + +/* + * _bt_skip_support() -- set up skip support function in *skipatts + * + * Returns true on success, indicating that we set *skipatts with input + * opclass's equality operator. Otherwise returns false. 
+ */ +static bool +_bt_skip_support(Relation rel, int add_skip_attno, BTSkipPreproc *skipatts) +{ + int16 *indoption = rel->rd_indoption; + Oid opfamily = rel->rd_opfamily[add_skip_attno - 1]; + Oid opcintype = rel->rd_opcintype[add_skip_attno - 1]; + bool reverse; + + /* Look up input opclass's equality operator */ + skipatts->eq_op = get_opfamily_member(opfamily, opcintype, opcintype, + BTEqualStrategyNumber); + + /* + * We don't expect input opclasses lacking even an equality operator, but + * it's possible. Deal with it gracefully. + */ + if (!OidIsValid(skipatts->eq_op)) + return false; + + /* Have skip support infrastructure set all SkipSupport fields */ + reverse = (indoption[add_skip_attno - 1] & INDOPTION_DESC) != 0; + skipatts->use_sksup = PrepareSkipSupportFromOpclass(opfamily, opcintype, + reverse, + &skipatts->sksup); + + /* might not have set up skip support routine, but can skip either way */ + return true; +} + +/* + * _bt_apply_decrement() -- Get a decremented copy of skey's arg + * + * Note: this wrapper function calls the opclass increment function when the + * index stores values in descending order. We're "logically decrementing" to + * the previous value in the key space regardless. + */ +static inline Datum +_bt_apply_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + if (!(skey->sk_flags & SK_BT_DESC)) + return array->sksup.decrement(rel, skey->sk_argument); + else + return array->sksup.increment(rel, skey->sk_argument); +} + +/* + * _bt_apply_increment() -- Get an incremented copy of skey's arg + * + * Note: this wrapper function calls the opclass decrement function when the + * index stores values in descending order. We're "logically incrementing" to + * the next value in the key space regardless. 
+ */ +static inline Datum +_bt_apply_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + if (!(skey->sk_flags & SK_BT_DESC)) + return array->sksup.increment(rel, skey->sk_argument); + else + return array->sksup.decrement(rel, skey->sk_argument); +} + /* * _bt_setup_array_cmp() -- Set up array comparison functions * @@ -979,15 +1398,10 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey { Relation rel = scan->indexRelation; Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1]; - int cmpresult = 0, - cmpexact = 0, - matchelem, - new_nelems = 0; FmgrInfo crosstypeproc; FmgrInfo *orderprocp = orderproc; Assert(arraysk->sk_attno == skey->sk_attno); - Assert(array->num_elems > 0); Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); Assert((arraysk->sk_flags & SK_SEARCHARRAY) && arraysk->sk_strategy == BTEqualStrategyNumber); @@ -1000,8 +1414,8 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey * datum of opclass input type for the index's attribute (on-disk type). * We can reuse the array's ORDER proc whenever the non-array scan key's * type is a match for the corresponding attribute's input opclass type. - * Otherwise, we have to do another ORDER proc lookup so that our call to - * _bt_binsrch_array_skey applies the correct comparator. + * Otherwise, we have to do another ORDER proc lookup. We have to be sure + * that _bt_compare_array_skey/_bt_binsrch_array_skey use the right proc. * * Note: we have to support the convention that sk_subtype == InvalidOid * means the opclass input type; this is a hack to simplify life for @@ -1032,11 +1446,45 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey return false; } - /* We have all we need to determine redundancy/contradictoriness */ orderprocp = &crosstypeproc; fmgr_info(cmp_proc, orderprocp); } + /* + * We have all we need to determine redundancy/contradictoriness. 
+ * + * Perform preprocessing of the array based on whether it's a conventional + * array, or a skip array. Sets *qual_ok correctly in passing. + */ + if (array->num_elems != -1) + _bt_apply_compare_array(arraysk, skey, + orderprocp, array, qual_ok); + else + _bt_apply_compare_skiparray(scan, arraysk, skey, orderproc, + orderprocp, array, qual_ok); + + return true; +} + +/* + * Finish off preprocessing of conventional (non-skip) array scan key when it + * is redundant with (or contradicted by) a non-array scalar scan key. + * + * _bt_compare_array_scankey_args helper function, called after the relevant + * (potentially cross-type) ORDER proc has been looked up successfully. + */ +static void +_bt_apply_compare_array(ScanKey arraysk, ScanKey skey, FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok) +{ + int cmpresult = 0, + cmpexact = 0, + matchelem, + new_nelems = 0; + + Assert(array->num_elems > 0); + Assert(!(arraysk->sk_flags & SK_BT_SKIP)); + matchelem = _bt_binsrch_array_skey(orderprocp, false, NoMovementScanDirection, skey->sk_argument, false, array, @@ -1088,8 +1536,175 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey array->num_elems = new_nelems; *qual_ok = new_nelems > 0; +} - return true; +/* + * Finish off preprocessing of skip array scan key when it is redundant with + * (or contradicted by) a non-array scalar scan key. + * + * _bt_compare_array_scankey_args helper function, called after the relevant + * (potentially cross-type) ORDER proc has been looked up successfully. + * + * Arrays used to skip (skip scan/missing key attribute predicates) work by + * procedurally generating their elements on the fly. We must still + * "eliminate contradictory elements", but it works a little differently: we + * narrow the range of the skip array, such that the array will never + * generate contradicted-by-skey elements. 
+ * + * FIXME Our behavior in scenarios with cross-type operators (range skip scan + * cases) is buggy. We're naively copying datums of a different type from + * scalar inequality scan keys into the array's low_value and high_value + * fields. In practice this tends to not visibly break (in practice types + * that appear within the same operator family tend to have compatible datum + * representations, at least on systems with little-endian byte order). Put + * off dealing with the problem until a later revision of the patch. + * + * It seems likely that the best way to fix this problem will involve keeping + * around the original operator in the BTArrayKeyInfo array struct whenever + * we're passed a "redundant" cross-type inequality operator (an approach + * involving casts/coercions might be tempting, but seems much too fragile). + * We only need to use not-column-input-opclass-type operators for the first + * and/or last array elements from the skip array under this scheme; we'll + * still mostly be dealing with opcintype-typed datums, copied from the index + * (as well as incrementing/decrementing copies of those index tuple datums). + * Importantly, this scheme should work just as well with an opfamily that + * doesn't even have an orderprocp cross-type ORDER operator to pass us here + * (we might even have to keep more than one same-strategy inequality, since + * in general _bt_preprocess_keys might not be able to prove which inequality + * is redundant). + */ +static void +_bt_apply_compare_skiparray(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Form_pg_attribute attr = TupleDescAttr(RelationGetDescr(rel), + skey->sk_attno - 1); + MemoryContext oldContext; + int cmpresult; + + /* + * We don't expect to have to deal with NULLs in non-array/non-skip scan + * key. 
We expect _bt_preprocess_array_keys to avoid generating a skip + * array for an index attribute with an IS NULL input scan key. It will + * still do so in the presence of IS NOT NULL input scan keys, but + * _bt_compare_scankey_args is expected to handle those for us. + */ + Assert(arraysk->sk_flags & SK_BT_SKIP); + Assert(arraysk->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & SK_ISNULL)); + Assert(array->num_elems == -1); + + /* + * Scalar scan key must be a B-Tree operator, which must always be strict. + * Array shouldn't generate a NULL "array element"/an IS NULL qual. This + * isn't just an optimization; it's strictly necessary for correctness. + */ + array->null_elem = false; + + if (!array->use_sksup) + { + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + array->high_compare = MemoryContextAlloc(so->arrayContext, + sizeof(ScanKeyData)); + memcpy(array->high_compare, skey, sizeof(ScanKeyData)); + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + array->low_compare = MemoryContextAlloc(so->arrayContext, + sizeof(ScanKeyData)); + memcpy(array->low_compare, skey, sizeof(ScanKeyData)); + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } + + array->null_elem = false; + *qual_ok = true; + + return; + } + + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + + /* + * detect if scan key argument will be < low_value once + * decremented + */ + cmpresult = _bt_compare_array_skey(orderprocp, + skey->sk_argument, false, + array->sksup.low_elem, false, + arraysk); + if (cmpresult <= 0) + { + /* decrementing would make qual unsatisfiable, so don't try */ + *qual_ok = false; + return; + } + + /* decremented scan key value becomes skip array's new high_value */ + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.high_elem = _bt_apply_decrement(rel, skey, array); + MemoryContextSwitchTo(oldContext); + break; 
+ case BTLessEqualStrategyNumber: + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.high_elem = datumCopy(skey->sk_argument, + attr->attbyval, attr->attlen); + MemoryContextSwitchTo(oldContext); + break; + case BTGreaterEqualStrategyNumber: + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.low_elem = datumCopy(skey->sk_argument, + attr->attbyval, attr->attlen); + MemoryContextSwitchTo(oldContext); + break; + case BTGreaterStrategyNumber: + + /* + * detect if scan key argument will be > high_value once + * incremented + */ + cmpresult = _bt_compare_array_skey(orderprocp, + skey->sk_argument, false, + array->sksup.high_elem, false, + arraysk); + if (cmpresult >= 0) + { + /* incrementing would make qual unsatisfiable, so don't try */ + *qual_ok = false; + return; + } + + /* incremented scan key value becomes skip array's new low_value */ + oldContext = MemoryContextSwitchTo(so->arrayContext); + array->sksup.low_elem = _bt_apply_increment(rel, skey, array); + MemoryContextSwitchTo(oldContext); + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } + + /* + * Is the qual contradictory, or is it merely "redundant" with consed-up + * skip array? 
+ */ + cmpresult = _bt_compare_array_skey(orderproc, /* don't use orderprocp */ + array->sksup.low_elem, false, + array->sksup.high_elem, false, + arraysk); + *qual_ok = (cmpresult <= 0); } /* @@ -1130,7 +1745,8 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur) + Datum arrdatum, bool arrnull, + ScanKey cur) { int32 result = 0; @@ -1138,14 +1754,14 @@ _bt_compare_array_skey(FmgrInfo *orderproc, if (tupnull) /* NULL tupdatum */ { - if (cur->sk_flags & SK_ISNULL) + if (arrnull) result = 0; /* NULL "=" NULL */ else if (cur->sk_flags & SK_BT_NULLS_FIRST) result = -1; /* NULL "<" NOT_NULL */ else result = 1; /* NULL ">" NOT_NULL */ } - else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ + else if (arrnull) /* NOT_NULL tupdatum, NULL arrdatum */ { if (cur->sk_flags & SK_BT_NULLS_FIRST) result = 1; /* NOT_NULL ">" NULL */ @@ -1211,6 +1827,8 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum arrdatum; Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(!(cur->sk_flags & SK_BT_SKIP)); + Assert(!(cur->sk_flags & SK_ISNULL)); /* plain arrays can't do this */ Assert(cur->sk_strategy == BTEqualStrategyNumber); if (cur_elem_trig) @@ -1246,7 +1864,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, { arrdatum = array->elem_values[low_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result <= 0) { @@ -1274,7 +1892,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, { arrdatum = array->elem_values[high_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result >= 0) { @@ -1301,7 +1919,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, arrdatum = array->elem_values[mid_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result == 0) { 
@@ -1326,13 +1944,196 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, */ if (low_elem != mid_elem) result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - array->elem_values[low_elem], cur); + array->elem_values[low_elem], false, + cur); *set_elem_result = result; return low_elem; } +/* + * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array + * + * Skip scan arrays procedurally generate their elements on-demand. They + * largely function in the same way as standard arrays. They can be rolled + * over by standard arrays (standard array can also roll over skip arrays). + * + * This routine doesn't return an index into the array, because the array + * doesn't actually have any elements (it has low_value and high_value, which + * indicate the range of values that the array can generate). Note that this + * may include a NULL value/an IS NULL qual (unlike with true arrays). + * + * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true + * array. The value 0 indicates that tupdatum/tupnull is within the range of + * the skip array. Other values indicate what _bt_compare_array_skey returned + * for the best available match to tupdatum/tupnull (in practice this means + * either the lowest item or the highest item in the range of the array). + * + * cur_elem_trig indicates if array advancement was triggered by this skip + * array's scan key. We can apply this information to find the next matching + * array element in the current scan direction using fewer comparisons. 
+ */ +static void +_bt_binsrch_skiparray_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + Datum arrdatum; + bool arrnull; + + Assert(!ScanDirectionIsNoMovement(dir)); + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems == -1); + + /* Precheck for NULL tupdatum, array without a NULL element */ + if (tupnull && !array->null_elem) + { + if (!(cur->sk_flags & SK_BT_NULLS_FIRST)) + *set_elem_result = 1; + else + *set_elem_result = -1; + + return; + } + + /* + * Compare tupdatum against "first array element" in the current scan + * direction first (and allow NULL to be treated as a possible element). + * + * Optimization: don't have to bother with this when passed a skip array + * that is known to have triggered array advancement. + */ + if (!cur_elem_trig) + { + if (array->use_sksup) + { + if (ScanDirectionIsForward(dir)) + { + arrdatum = array->sksup.low_elem; + arrnull = array->null_elem && + (cur->sk_flags & SK_BT_NULLS_FIRST); + } + else + { + arrdatum = array->sksup.high_elem; + arrnull = array->null_elem && + !(cur->sk_flags & SK_BT_NULLS_FIRST); + } + + *set_elem_result = _bt_compare_array_skey(orderproc, + tupdatum, tupnull, + arrdatum, arrnull, cur); + + /* + * Optimization: return early when >= lower bound happens to be an + * exact match (or when <= upper bound is an exact match during a + * backwards scan) + */ + if (*set_elem_result == 0) + return; + } + else + { + *set_elem_result = 0; /* for now */ + + if (ScanDirectionIsForward(dir) && array->low_compare) + { + ScanKey low_compare = array->low_compare; + + if (!DatumGetBool(FunctionCall2Coll(&low_compare->sk_func, + low_compare->sk_collation, + tupdatum, + low_compare->sk_argument))) + *set_elem_result = -1; + } + else if (ScanDirectionIsBackward(dir) && array->high_compare) + { + ScanKey 
high_compare = array->high_compare; + + if (!DatumGetBool(FunctionCall2Coll(&high_compare->sk_func, + high_compare->sk_collation, + tupdatum, + high_compare->sk_argument))) + *set_elem_result = 1; + } + } + + /* tupdatum before the start of first element in scan direction? */ + if ((ScanDirectionIsForward(dir) && *set_elem_result < 0) || + (ScanDirectionIsBackward(dir) && *set_elem_result > 0)) + return; + } + + /* + * Now compare tupdatum to the last array element in the current scan + * direction (and allow NULL to be treated as a possible element) + */ + if (array->use_sksup) + { + /* + * We have skip support, so there is literally a final element + */ + if (ScanDirectionIsForward(dir)) + { + arrdatum = array->sksup.high_elem; + arrnull = array->null_elem && !(cur->sk_flags & SK_BT_NULLS_FIRST); + } + else + { + arrdatum = array->sksup.low_elem; + arrnull = array->null_elem && (cur->sk_flags & SK_BT_NULLS_FIRST); + } + *set_elem_result = _bt_compare_array_skey(orderproc, + tupdatum, tupnull, + arrdatum, arrnull, cur); + } + else + { + *set_elem_result = 0; /* for now */ + + /* + * No skip support. Need to use any inequalities required in the + * current scan direction as demarcating where the final element is. + */ + if (ScanDirectionIsForward(dir) && array->high_compare) + { + ScanKey high_compare = array->high_compare; + + if (!DatumGetBool(FunctionCall2Coll(&high_compare->sk_func, + high_compare->sk_collation, + tupdatum, + high_compare->sk_argument))) + *set_elem_result = 1; + } + else if (ScanDirectionIsBackward(dir) && array->low_compare) + { + ScanKey low_compare = array->low_compare; + + if (!DatumGetBool(FunctionCall2Coll(&low_compare->sk_func, + low_compare->sk_collation, + tupdatum, + low_compare->sk_argument))) + *set_elem_result = -1; + } + } + + /* tupdatum after the end of final element in scan direction? 
*/ + if ((ScanDirectionIsForward(dir) && *set_elem_result > 0) || + (ScanDirectionIsBackward(dir) && *set_elem_result < 0)) + return; + + /* + * tupdatum is within the range of the skip array. This is equivalent to + * _bt_binsrch_array_skey finding an exactly matching array element. + */ + *set_elem_result = 0; +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * @@ -1342,29 +2143,488 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - int i; Assert(so->numArrayKeys); Assert(so->qual_ok); - for (i = 0; i < so->numArrayKeys; i++) + for (int i = 0; i < so->numArrayKeys; i++) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->keyData[curArrayKey->scan_key]; - Assert(curArrayKey->num_elems > 0); Assert(skey->sk_flags & SK_SEARCHARRAY); - if (ScanDirectionIsBackward(dir)) - curArrayKey->cur_elem = curArrayKey->num_elems - 1; - else - curArrayKey->cur_elem = 0; - skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; + _bt_scankey_set_low_or_high(rel, skey, curArrayKey, + ScanDirectionIsForward(dir)); } so->scanBehind = false; } +/* + * _bt_scankey_decrement() -- decrement scan key's sk_argument + * + * Unsets scan key "IS NULL" flags if required. Cannot handle "decrementing" + * sk_argument from a non-NULL value to the value NULL. 
+ */ +static void +_bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (skey->sk_flags & SK_ISNULL) + _bt_scankey_unset_isnull(rel, skey, array); + else + { + Datum dec_sk_argument; + Form_pg_attribute attr; + + /* Get a decremented copy of existing sk_argument */ + dec_sk_argument = _bt_apply_decrement(rel, skey, array); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Set decremented copy of original sk_argument in scan key */ + skey->sk_argument = dec_sk_argument; + } +} + +/* + * _bt_scankey_increment() -- increment scan key's sk_argument + * + * Unsets scan key "IS NULL" flags if required. Cannot handle "incrementing" + * sk_argument from a non-NULL value to the value NULL. + */ +static void +_bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(array->use_sksup); + + if (skey->sk_flags & SK_ISNULL) + _bt_scankey_unset_isnull(rel, skey, array); + else + { + Datum inc_sk_argument; + Form_pg_attribute attr; + + /* Get an incremented copy of existing sk_argument */ + inc_sk_argument = _bt_apply_increment(rel, skey, array); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Set incremented copy of original sk_argument in scan key */ + skey->sk_argument = inc_sk_argument; + } +} + +/* + * _bt_scankey_set_low_or_high() -- Set array scan key to lowest/highest element + * + * Caller also passes associated scan key, which will have its argument set to + * the lowest/highest array value in 
passing. + */ +static void +_bt_scankey_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + bool low_not_high) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (array->num_elems != -1) + { + /* set low or high element for conventional array */ + int set_elem = 0; + + Assert(!(skey->sk_flags & SK_BT_SKIP)); + + if (!low_not_high) + set_elem = array->num_elems - 1; + + /* + * Just copy over array datum (only skip arrays require freeing and + * allocating memory for sk_argument) + */ + array->cur_elem = set_elem; + skey->sk_argument = array->elem_values[set_elem]; + + return; + } + + /* set low or high element for skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Clear possibly-irrelevant flags (before possibly setting some again) */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_NEG_INF | SK_BT_POS_INF | + SK_BT_NEXTKEY | SK_BT_PREVKEY); + + if (array->null_elem && + (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) + { + /* Set element to NULL (lowest/highest element) */ + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); + } + else if (low_not_high) + { + /* Lowest array element isn't NULL */ + ScanKey low_compare = array->low_compare; + + if (array->use_sksup) + skey->sk_argument = datumCopy(array->sksup.low_elem, + attr->attbyval, attr->attlen); + else if (!low_compare) + skey->sk_flags |= SK_BT_NEG_INF; + else if (low_compare->sk_subtype != InvalidOid && + low_compare->sk_subtype != + rel->rd_opcintype[skey->sk_attno - 1]) + { + /* XXX papers-over lack of cross-type support in _bt_first */ + skey->sk_flags |= SK_BT_NEG_INF; + } + else + { + skey->sk_argument = datumCopy(low_compare->sk_argument, + 
attr->attbyval, attr->attlen); + + if (low_compare->sk_strategy == BTGreaterStrategyNumber) + skey->sk_flags |= SK_BT_NEXTKEY; + } + } + else + { + /* Highest array element isn't NULL */ + ScanKey high_compare = array->high_compare; + + if (array->use_sksup) + skey->sk_argument = datumCopy(array->sksup.high_elem, + attr->attbyval, attr->attlen); + else if (!high_compare) + skey->sk_flags |= SK_BT_POS_INF; + else if (high_compare->sk_subtype != InvalidOid && + high_compare->sk_subtype != + rel->rd_opcintype[skey->sk_attno - 1]) + { + /* XXX papers-over lack of cross-type support in _bt_first */ + skey->sk_flags |= SK_BT_POS_INF; + } + else + { + skey->sk_argument = datumCopy(high_compare->sk_argument, + attr->attbyval, attr->attlen); + if (high_compare->sk_strategy == BTLessStrategyNumber) + skey->sk_flags |= SK_BT_PREVKEY; + } + } +} + +/* + * _bt_scankey_skip_increment() -- increment a skip scan key, and its array + * + * Returns true when the skip array was successfully incremented to the next + * value in the current scan direction, dir. Otherwise handles roll over by + * setting array to its first element for the current scan direction. + */ +static bool +_bt_scankey_skip_increment(Relation rel, ScanDirection dir, + BTArrayKeyInfo *array, ScanKey skey, + FmgrInfo *orderproc) +{ + Datum sk_argument = skey->sk_argument; + bool sk_isnull = (skey->sk_flags & SK_ISNULL) != 0; + int compare; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(array->num_elems == -1); + + /* + * Precheck for the sentinel values -inf and +inf. These values are only + * used for index columns whose input operator class doesn't provide its + * own skip support routine. 
+ */ + Assert(!(skey->sk_flags & SK_BT_POS_INF) || ScanDirectionIsForward(dir)); + Assert(!(skey->sk_flags & SK_BT_NEG_INF) || ScanDirectionIsBackward(dir)); + if (skey->sk_flags & (SK_BT_POS_INF | SK_BT_NEG_INF)) + { + Assert(!array->use_sksup); + goto rollover; + } + + skey->sk_flags &= ~(SK_BT_NEXTKEY | SK_BT_PREVKEY); + + if (ScanDirectionIsForward(dir)) + { + if (array->high_compare) + { + ScanKey high_compare = array->high_compare; + + Assert(!array->use_sksup); + Assert(!array->null_elem && !sk_isnull); + + if (high_compare->sk_strategy == BTLessEqualStrategyNumber) + { + /* XXX Need to consider cross-type operator families here */ + compare = _bt_compare_array_skey(orderproc, + high_compare->sk_argument, false, + sk_argument, sk_isnull, skey); + if (compare <= 0) + goto rollover; + } + else if (!DatumGetBool(FunctionCall2Coll(&high_compare->sk_func, + high_compare->sk_collation, + sk_argument, + high_compare->sk_argument))) + goto rollover; + } + + if (!array->use_sksup) + { + /* + * Optimization: when the current array element is NULL, and the + * last item stored in the index is also NULL, treat NULL as the + * final array element (final when scanning forwards). + * + * This saves a useless primitive index scan that would otherwise + * try to locate a value after NULL. 
+ */ + if (sk_isnull && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + goto rollover; + + /* "Increment" sk_argument to sentinel value */ + skey->sk_flags |= SK_BT_NEXTKEY; + return true; + } + + /* high_elem is final non-NULL element in current scan direction */ + compare = _bt_compare_array_skey(orderproc, + array->sksup.high_elem, false, + sk_argument, sk_isnull, skey); + if (compare > 0) + { + /* Increment sk_argument to next non-NULL array element */ + _bt_scankey_increment(rel, skey, array); + + return true; + } + else if (compare == 0 && array->null_elem && + !(skey->sk_flags & SK_BT_NULLS_FIRST)) + { + /* + * Existing sk_argument is already equal to non-NULL high_elem, + * but skip array's true highest element is actually NULL. + * + * "Increment" sk_argument to NULL. + */ + _bt_scankey_set_isnull(rel, skey, array); + + return true; + } + + /* Exhausted all array elements in current scan direction */ + } + else + { + if (array->low_compare) + { + ScanKey low_compare = array->low_compare; + + Assert(!array->use_sksup); + Assert(!array->null_elem && !sk_isnull); + + if (low_compare->sk_strategy == BTGreaterEqualStrategyNumber) + { + /* XXX Need to consider cross-type operator families here */ + compare = _bt_compare_array_skey(orderproc, + low_compare->sk_argument, false, + sk_argument, sk_isnull, skey); + if (compare >= 0) + goto rollover; + } + else if (!DatumGetBool(FunctionCall2Coll(&low_compare->sk_func, + low_compare->sk_collation, + sk_argument, + low_compare->sk_argument))) + goto rollover; + } + + if (!array->use_sksup) + { + /* + * Optimization: when the current array element is NULL, and the + * first item stored in the index is also NULL, treat NULL as the + * final array element (final when scanning backwards). + * + * This saves a useless primitive index scan that would otherwise + * try to locate a value before NULL. 
+ */ + if (sk_isnull && (skey->sk_flags & SK_BT_NULLS_FIRST)) + goto rollover; + + /* "Decrement" sk_argument to sentinel value */ + skey->sk_flags |= SK_BT_PREVKEY; + return true; + } + + /* low_elem is final non-NULL element in current scan direction */ + compare = _bt_compare_array_skey(orderproc, + array->sksup.low_elem, false, + sk_argument, sk_isnull, skey); + if (compare < 0) + { + /* Decrement sk_argument to previous non-NULL array element */ + _bt_scankey_decrement(rel, skey, array); + + return true; + } + else if (compare == 0 && array->null_elem && + (skey->sk_flags & SK_BT_NULLS_FIRST)) + { + /* + * Existing sk_argument is already equal to non-NULL low_elem, but + * skip array's true lowest element is actually NULL. + * + * "Decrement" sk_argument to NULL. + */ + _bt_scankey_set_isnull(rel, skey, array); + + return true; + } + + /* Exhausted all array elements in current scan direction */ + } + + /* + * Skip array rolls over. Start over at the array's lowest sorting value + * (or its highest value, for backward scans). + */ +rollover: + + _bt_scankey_set_low_or_high(rel, skey, array, ScanDirectionIsForward(dir)); + + /* Caller must consider earlier/more significant arrays in turn */ + return false; +} + +/* + * _bt_scankey_set_element() -- Set skip array scan key's sk_argument + * + * Sets scan key to "IS NULL" when required, and handles memory management for + * pass-by-reference types. 
+ */ +static void +_bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + Datum tupdatum, bool tupnull) +{ + /* tupdatum within the range of low_value/high_value */ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(tupnull && !array->null_elem)); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_NEG_INF | SK_BT_POS_INF | + SK_BT_NEXTKEY | SK_BT_PREVKEY); + + /* + * Treat tupdatum/tupnull as a matching array element. + * + * We just copy tupdatum into the array's scan key (there is no + * conventional array element for us to set, of course). + * + * Unlike standard arrays, skip arrays sometimes need to locate NULLs. + * Treat them as just another value from the domain of indexed values. + */ + if (!tupnull) + skey->sk_argument = datumCopy(tupdatum, attr->attbyval, attr->attlen); + else + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + +/* + * _bt_scankey_unset_isnull() -- increment/decrement scan key from NULL + * + * Unsets scan key's "IS NULL" marking, and sets the non-NULL value from the + * array immediately before (or immediately after) NULL in the key space. 
+ */ +static void +_bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(skey->sk_flags & SK_ISNULL); + Assert(array->use_sksup); + Assert(array->null_elem); + + /* + * sk_argument must be set to whatever non-NULL value comes immediately + * before or after NULL + */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_NEG_INF | SK_BT_POS_INF | + SK_BT_NEXTKEY | SK_BT_PREVKEY); + if (skey->sk_flags & SK_BT_NULLS_FIRST) + skey->sk_argument = datumCopy(array->sksup.low_elem, + attr->attbyval, attr->attlen); + else + skey->sk_argument = datumCopy(array->sksup.high_elem, + attr->attbyval, attr->attlen); +} + +/* + * _bt_scankey_set_isnull() -- increment/decrement scan key to NULL + * + * Sets scan key to "IS NULL", and handles memory management for + * pass-by-reference types. + */ +static void +_bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_SEARCHNULL | SK_ISNULL | + SK_BT_NEG_INF | SK_BT_POS_INF | + SK_BT_NEXTKEY | SK_BT_PREVKEY))); + Assert(array->null_elem); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Set sk_argument to NULL */ + skey->sk_argument = (Datum) 0; + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + /* * _bt_advance_array_keys_increment() -- Advance to next set of array elements * @@ -1380,6 +2640,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; 
BTScanOpaque so = (BTScanOpaque) scan->opaque; /* @@ -1391,10 +2652,24 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->keyData[curArrayKey->scan_key]; + FmgrInfo *orderproc = &so->orderProcs[curArrayKey->scan_key]; int cur_elem = curArrayKey->cur_elem; int num_elems = curArrayKey->num_elems; bool rolled = false; + /* Handle incrementing a skip array */ + if (num_elems == -1) + { + /* Attempt to incrementally advance this skip scan array */ + if (_bt_scankey_skip_increment(rel, dir, curArrayKey, skey, + orderproc)) + return true; + + /* Array rolled over. Need to advance next array key, if any. */ + continue; + } + + /* Handle incrementing a true array */ if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems) { cur_elem = 0; @@ -1411,7 +2686,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) if (!rolled) return true; - /* Need to advance next array key, if any */ + /* Array rolled over. Need to advance next array key, if any. 
*/ } /* @@ -1466,6 +2741,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; int arrayidx = 0; @@ -1473,7 +2749,6 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { ScanKey cur = so->keyData + ikey; BTArrayKeyInfo *array = NULL; - int first_elem_dir; if (!(cur->sk_flags & SK_SEARCHARRAY) || cur->sk_strategy != BTEqualStrategyNumber) @@ -1485,16 +2760,10 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) continue; - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; + Assert(array->num_elems != -1); /* No skipping of non-required arrays */ - if (array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); } } @@ -1558,6 +2827,8 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) { ScanKey cur = so->keyData + ikey; + Datum sk_argument = cur->sk_argument; + bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0; Datum tupdatum; bool tupnull; int32 result; @@ -1617,11 +2888,14 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, continue; } + if (cur->sk_flags & (SK_BT_NEG_INF | SK_BT_POS_INF)) + return false; + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); result = _bt_compare_array_skey(&so->orderProcs[ikey], tupdatum, tupnull, - cur->sk_argument, cur); + sk_argument, sk_isnull, cur); /* * Does this comparison indicate that caller must _not_ advance the @@ -1631,6 +2905,9 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, (ScanDirectionIsBackward(dir) 
&& result > 0)) return true; + if ((cur->sk_flags & (SK_BT_NEXTKEY | SK_BT_PREVKEY)) && result == 0) + return true; + /* * Does this comparison indicate that caller should now advance the * scan's arrays? (Must be if we get here during a readpagetup call.) @@ -1954,18 +3231,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (beyond_end_advance) { - int final_elem_dir; - - if (ScanDirectionIsBackward(dir) || !array) - final_elem_dir = 0; - else - final_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != final_elem_dir) - { - array->cur_elem = final_elem_dir; - cur->sk_argument = array->elem_values[final_elem_dir]; - } + if (array) + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); continue; } @@ -1990,18 +3258,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (!all_required_satisfied || cur->sk_attno > tupnatts) { - int first_elem_dir; - - if (ScanDirectionIsForward(dir) || !array) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + if (array) + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); continue; } @@ -2019,15 +3278,27 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, /* * Binary search for closest match that's available from the array */ - set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], - cur_elem_trig, dir, - tupdatum, tupnull, array, cur, - &result); + if (array->num_elems != -1) + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); - Assert(set_elem >= 0 && set_elem < array->num_elems); + /* + * Skip array. "Binary search" by checking if tupdatum/tupnull + * are within the low_value/high_value range of the skip array. 
+ */ + else + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); } else { + Datum sk_argument = cur->sk_argument; + bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0; + Assert(sktrig_required && required); /* @@ -2041,7 +3312,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ result = _bt_compare_array_skey(&so->orderProcs[ikey], tupdatum, tupnull, - cur->sk_argument, cur); + sk_argument, sk_isnull, cur); } /* @@ -2100,11 +3371,62 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } } - /* Advance array keys, even when set_elem isn't an exact match */ - if (array && array->cur_elem != set_elem) + /* Advance array keys, even when we don't have an exact match */ + + if (!array) + continue; /* no element to set in non-array */ + + /* Conventional arrays have a valid set_elem for us to advance to */ + if (array->num_elems != -1) { - array->cur_elem = set_elem; - cur->sk_argument = array->elem_values[set_elem]; + if (array->cur_elem != set_elem) + { + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; + } + + continue; + } + + /* + * Conceptually, skip arrays also have array elements. The actual + * elements/values are generated procedurally and on demand. + */ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + Assert(required); + + if (result == 0) + { + /* + * Anything within the range of possible element values is treated + * as "a match for one of the array's elements". Store the next + * scan key argument value by taking a copy of the tupdatum value + * from caller's tuple (or set scan key IS NULL when tupnull, iff + * the array's range of possible elements covers NULL). 
+ */ + _bt_scankey_set_element(rel, cur, array, tupdatum, tupnull); + } + else if (beyond_end_advance) + { + /* + * We need to set the array element to the final "element" in the + * current scan direction for "beyond end of array element" array + * advancement. See above for an explanation. + */ + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); + } + else + { + /* + * The closest matching element is the lowest element; even that + * still puts us ahead of caller's tuple in the key space. This + * process has to carry to any lower-order arrays. See above for + * an explanation. + */ + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); } } @@ -2460,10 +3782,12 @@ end_toplevel_scan: /* * _bt_preprocess_keys() -- Preprocess scan keys * + * The first call here (per btrescan) allocates so->keyData[]. * The given search-type keys (taken from scan->keyData[]) * are copied to so->keyData[] with possible transformation. * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets - * the number of output keys (possibly less, never greater). + * the number of output keys. Calling here a second time (during the same + * btrescan) is a no-op. * * The output keys are marked with additional sk_flags bits beyond the * system-standard bits supplied by the caller. The DESC and NULLS_FIRST @@ -2483,6 +3807,8 @@ end_toplevel_scan: * within each attribute may be done as a byproduct of the processing here. * That process must leave array scan keys (within an attribute) in the same * order as corresponding entries from the scan's BTArrayKeyInfo array info. + * We might also cons up skip array scan keys that weren't present in the + * original input keys; these are also output in standard attribute order. 
* * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD * if they must be satisfied in order to continue the scan forward or backward @@ -2550,9 +3876,7 @@ _bt_preprocess_keys(IndexScanDesc scan) int16 *indoption = scan->indexRelation->rd_indoption; int new_numberOfKeys; int numberOfEqualCols; - ScanKey inkeys; - ScanKey outkeys; - ScanKey cur; + ScanKey inputsk; BTScanKeyPreproc xform[BTMaxStrategyNumber]; bool test_result; int i, @@ -2584,7 +3908,7 @@ _bt_preprocess_keys(IndexScanDesc scan) return; /* done if qual-less scan */ /* If any keys are SK_SEARCHARRAY type, set up array-key info */ - arrayKeyData = _bt_preprocess_array_keys(scan); + arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys); if (!so->qual_ok) { /* unmatchable array, so give up */ @@ -2598,32 +3922,36 @@ _bt_preprocess_keys(IndexScanDesc scan) */ if (arrayKeyData) { - inkeys = arrayKeyData; + inputsk = arrayKeyData; /* Also maintain keyDataMap for remapping so->orderProc[] later */ keyDataMap = MemoryContextAlloc(so->arrayContext, numberOfKeys * sizeof(int)); } else - inkeys = scan->keyData; + inputsk = scan->keyData; + + /* + * Now that we have an estimate of the number of output scan keys + * (including any skip array scan keys), allocate space for them + */ + so->keyData = palloc(sizeof(ScanKeyData) * numberOfKeys); - outkeys = so->keyData; - cur = &inkeys[0]; /* we check that input keys are correctly ordered */ - if (cur->sk_attno < 1) + if (inputsk->sk_attno < 1) elog(ERROR, "btree index keys must be ordered by attribute"); /* We can short-circuit most of the work if there's just one key */ if (numberOfKeys == 1) { /* Apply indoption to scankey (might change sk_strategy!) 
*/ - if (!_bt_fix_scankey_strategy(cur, indoption)) + if (!_bt_fix_scankey_strategy(inputsk, indoption)) so->qual_ok = false; - memcpy(outkeys, cur, sizeof(ScanKeyData)); + memcpy(so->keyData, inputsk, sizeof(ScanKeyData)); so->numberOfKeys = 1; /* We can mark the qual as required if it's for first index col */ - if (cur->sk_attno == 1) - _bt_mark_scankey_required(outkeys); + if (inputsk->sk_attno == 1) + _bt_mark_scankey_required(so->keyData); if (arrayKeyData) { /* @@ -2631,8 +3959,8 @@ _bt_preprocess_keys(IndexScanDesc scan) * (we'll miss out on the single value array transformation, but * that's not nearly as important when there's only one scan key) */ - Assert(cur->sk_flags & SK_SEARCHARRAY); - Assert(cur->sk_strategy != BTEqualStrategyNumber || + Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY); + Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber || (so->arrayKeys[0].scan_key == 0 && OidIsValid(so->orderProcs[0].fn_oid))); } @@ -2660,12 +3988,12 @@ _bt_preprocess_keys(IndexScanDesc scan) * handle after-last-key processing. Actual exit from the loop is at the * "break" statement below. */ - for (i = 0;; cur++, i++) + for (i = 0;; inputsk++, i++) { if (i < numberOfKeys) { /* Apply indoption to scankey (might change sk_strategy!) */ - if (!_bt_fix_scankey_strategy(cur, indoption)) + if (!_bt_fix_scankey_strategy(inputsk, indoption)) { /* NULL can't be matched, so give up */ so->qual_ok = false; @@ -2677,12 +4005,12 @@ _bt_preprocess_keys(IndexScanDesc scan) * If we are at the end of the keys for a particular attr, finish up * processing and emit the cleaned-up keys. 
*/ - if (i == numberOfKeys || cur->sk_attno != attno) + if (i == numberOfKeys || inputsk->sk_attno != attno) { int priorNumberOfEqualCols = numberOfEqualCols; /* check input keys are correctly ordered */ - if (i < numberOfKeys && cur->sk_attno < attno) + if (i < numberOfKeys && inputsk->sk_attno < attno) elog(ERROR, "btree index keys must be ordered by attribute"); /* @@ -2741,7 +4069,8 @@ _bt_preprocess_keys(IndexScanDesc scan) return; } /* else discard the redundant non-equality key */ - Assert(!array || array->num_elems > 0); + Assert(!array || array->num_elems > 0 || + array->num_elems == -1); xform[j].skey = NULL; xform[j].ikey = -1; } @@ -2786,7 +4115,7 @@ _bt_preprocess_keys(IndexScanDesc scan) } /* - * Emit the cleaned-up keys into the outkeys[] array, and then + * Emit the cleaned-up keys into the so->keyData[] array, and then * mark them if they are required. They are required (possibly * only in one direction) if all attrs before this one had "=". */ @@ -2794,7 +4123,7 @@ _bt_preprocess_keys(IndexScanDesc scan) { if (xform[j].skey) { - ScanKey outkey = &outkeys[new_numberOfKeys++]; + ScanKey outkey = &so->keyData[new_numberOfKeys++]; memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); if (arrayKeyData) @@ -2811,19 +4140,19 @@ _bt_preprocess_keys(IndexScanDesc scan) break; /* Re-initialize for new attno */ - attno = cur->sk_attno; + attno = inputsk->sk_attno; memset(xform, 0, sizeof(xform)); } /* check strategy this key's operator corresponds to */ - j = cur->sk_strategy - 1; + j = inputsk->sk_strategy - 1; /* if row comparison, push it directly to the output array */ - if (cur->sk_flags & SK_ROW_HEADER) + if (inputsk->sk_flags & SK_ROW_HEADER) { - ScanKey outkey = &outkeys[new_numberOfKeys++]; + ScanKey outkey = &so->keyData[new_numberOfKeys++]; - memcpy(outkey, cur, sizeof(ScanKeyData)); + memcpy(outkey, inputsk, sizeof(ScanKeyData)); if (arrayKeyData) keyDataMap[new_numberOfKeys - 1] = i; if (numberOfEqualCols == attno - 1) @@ -2837,19 +4166,8 @@ 
_bt_preprocess_keys(IndexScanDesc scan) continue; } - /* - * Does this input scan key require further processing as an array? - */ - if (cur->sk_strategy == InvalidStrategy) - { - /* _bt_preprocess_array_keys marked this array key redundant */ - Assert(arrayKeyData); - Assert(cur->sk_flags & SK_SEARCHARRAY); - continue; - } - - if (cur->sk_strategy == BTEqualStrategyNumber && - (cur->sk_flags & SK_SEARCHARRAY)) + if (inputsk->sk_strategy == BTEqualStrategyNumber && + (inputsk->sk_flags & SK_SEARCHARRAY)) { /* _bt_preprocess_array_keys kept this array key */ Assert(arrayKeyData); @@ -2863,7 +4181,7 @@ _bt_preprocess_keys(IndexScanDesc scan) if (xform[j].skey == NULL) { /* nope, so this scan key wins by default (at least for now) */ - xform[j].skey = cur; + xform[j].skey = inputsk; xform[j].ikey = i; xform[j].arrayidx = arrayidx; } @@ -2881,7 +4199,7 @@ _bt_preprocess_keys(IndexScanDesc scan) /* * Have to set up array keys */ - if ((cur->sk_flags & SK_SEARCHARRAY)) + if ((inputsk->sk_flags & SK_SEARCHARRAY)) { array = &so->arrayKeys[arrayidx - 1]; orderproc = so->orderProcs + i; @@ -2909,13 +4227,15 @@ _bt_preprocess_keys(IndexScanDesc scan) */ } - if (_bt_compare_scankey_args(scan, cur, cur, xform[j].skey, - array, orderproc, &test_result)) + if (_bt_compare_scankey_args(scan, inputsk, inputsk, + xform[j].skey, array, orderproc, + &test_result)) { /* Have all we need to determine redundancy */ if (test_result) { - Assert(!array || array->num_elems > 0); + Assert(!array || array->num_elems > 0 || + array->num_elems == -1); /* * New key is more restrictive, and so replaces old key... @@ -2923,7 +4243,7 @@ _bt_preprocess_keys(IndexScanDesc scan) if (j != (BTEqualStrategyNumber - 1) || !(xform[j].skey->sk_flags & SK_SEARCHARRAY)) { - xform[j].skey = cur; + xform[j].skey = inputsk; xform[j].ikey = i; xform[j].arrayidx = arrayidx; } @@ -2936,7 +4256,7 @@ _bt_preprocess_keys(IndexScanDesc scan) * scan key. 
_bt_compare_scankey_args expects us to * always keep arrays (and discard non-arrays). */ - Assert(!(cur->sk_flags & SK_SEARCHARRAY)); + Assert(!(inputsk->sk_flags & SK_SEARCHARRAY)); } } else if (j == (BTEqualStrategyNumber - 1)) @@ -2959,14 +4279,14 @@ _bt_preprocess_keys(IndexScanDesc scan) * even with incomplete opfamilies. _bt_advance_array_keys * depends on this. */ - ScanKey outkey = &outkeys[new_numberOfKeys++]; + ScanKey outkey = &so->keyData[new_numberOfKeys++]; memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); if (arrayKeyData) keyDataMap[new_numberOfKeys - 1] = xform[j].ikey; if (numberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); - xform[j].skey = cur; + xform[j].skey = inputsk; xform[j].ikey = i; xform[j].arrayidx = arrayidx; } @@ -3057,10 +4377,11 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->scan_key != ikey) return false; - if (array->num_elems <= 0) + if (array->num_elems == 0 || array->num_elems < -1) return false; - if (cur->sk_argument != array->elem_values[array->cur_elem]) + if (array->num_elems != -1 && + cur->sk_argument != array->elem_values[array->cur_elem]) return false; if (last_sk_attno > cur->sk_attno) return false; @@ -3135,6 +4456,22 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, bool leftnull, rightnull; + /* Handle skip array comparison with IS NOT NULL scan key */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP) + { + /* Shouldn't generate skip array in presence of IS NULL key */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNULL)); + Assert((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNOTNULL); + + /* Don't allow skip array to generate IS NULL scan key/element */ + Assert(array->num_elems == -1); + array->null_elem = false; + + /* IS NOT NULL key (could be leftarg or rightarg) now redundant */ + *result = true; + return true; + } + if (leftarg->sk_flags & SK_ISNULL) { Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); @@ -3208,6 
+4545,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, { /* Can't make the comparison */ *result = false; /* suppress compiler warnings */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); return false; } @@ -3380,13 +4718,6 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) return true; } - if (skey->sk_strategy == InvalidStrategy) - { - /* Already-eliminated array scan key; don't need to fix anything */ - Assert(skey->sk_flags & SK_SEARCHARRAY); - return true; - } - /* Adjust strategy for DESC, if we didn't already */ if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC)) skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy); @@ -3734,6 +5065,21 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, continue; } + /* + * A skip array scan key might be negative/positive infinity. Might + * also be next key/previous key sentinel, which we don't deal with. + */ + if (key->sk_flags & (SK_BT_NEG_INF | SK_BT_POS_INF | + SK_BT_NEXTKEY | SK_BT_PREVKEY)) + { + Assert(key->sk_flags & SK_SEARCHARRAY); + Assert(key->sk_flags & SK_BT_SKIP); + Assert(requiredSameDir); + + *continuescan = false; + return false; + } + /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c index e9d4cd60d..96d0d9185 100644 --- a/src/backend/access/nbtree/nbtvalidate.c +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -114,6 +114,10 @@ btvalidate(Oid opclassoid) case BTOPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case BTSKIPSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index b8b5c147c..a86dbf71b 100644 --- a/src/backend/commands/opclasscmds.c +++ 
b/src/backend/commands/opclasscmds.c @@ -1330,6 +1330,31 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("btree equal image functions must not be cross-type"))); } + else if (member->number == BTSKIPSUPPORT_PROC) + { + if (procform->pronargs != 1 || + procform->proargtypes.values[0] != INTERNALOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must accept type \"internal\""))); + if (procform->prorettype != VOIDOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must return void"))); + + /* + * pg_amproc functions are indexed by (lefttype, righttype), but a + * skip support function doesn't make sense in cross-type + * scenarios. The same opclass opcintype OID is always used for + * lefttype and righttype. Providing a cross-type routine isn't + * sensible. Reject cross-type ALTER OPERATOR FAMILY ... ADD + * FUNCTION 6 statements here. 
+ */ + if (member->lefttype != member->righttype) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must not be cross-type"))); + } } else if (amoid == HASH_AM_OID) { diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index edb09d4e3..e945686c8 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -96,6 +96,7 @@ OBJS = \ rowtypes.o \ ruleutils.o \ selfuncs.o \ + skipsupport.o \ tid.o \ timestamp.o \ trigfuncs.o \ diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 9c854e0e5..ea3d0f4b5 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -34,6 +34,7 @@ #include "utils/date.h" #include "utils/datetime.h" #include "utils/numeric.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" /* @@ -455,6 +456,39 @@ date_sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +date_decrement(Relation rel, Datum existing) +{ + DateADT dexisting = DatumGetDateADT(existing); + + Assert(dexisting > DATEVAL_NOBEGIN); + + return DateADTGetDatum(dexisting - 1); +} + +static Datum +date_increment(Relation rel, Datum existing) +{ + DateADT dexisting = DatumGetDateADT(existing); + + Assert(dexisting < DATEVAL_NOEND); + + return DateADTGetDatum(dexisting + 1); +} + +Datum +date_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = date_decrement; + sksup->increment = date_increment; + sksup->low_elem = DateADTGetDatum(DATEVAL_NOBEGIN); + sksup->high_elem = DateADTGetDatum(DATEVAL_NOEND); + + PG_RETURN_VOID(); +} + Datum date_finite(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index 8c6fc80c3..91682edd5 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -83,6 +83,7 @@ backend_sources += files( 'rowtypes.c', 'ruleutils.c', 'selfuncs.c', + 'skipsupport.c', 
'tid.c', 'timestamp.c', 'trigfuncs.c', diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 5f5d7959d..33b1722df 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6800,6 +6800,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, List *indexBoundQuals; int indexcol; bool eqQualHere; + bool found_skip; bool found_saop; bool found_is_null_op; double num_sa_scans; @@ -6825,6 +6826,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, indexBoundQuals = NIL; indexcol = 0; eqQualHere = false; + found_skip = false; found_saop = false; found_is_null_op = false; num_sa_scans = 1; @@ -6833,15 +6835,38 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, IndexClause *iclause = lfirst_node(IndexClause, lc); ListCell *lc2; + /* + * XXX For now we just cost skip scans via generic rules: make a + * uniform assumption that there will be 10 primitive index scans per + * skipped attribute, relying on the "1/3 of all index pages" cap that + * this costing has used since Postgres 17. Also assume that skipping + * won't take place for an index that has fewer than 100 pages. + * + * The current approach to costing leaves much to be desired, but is + * at least better than nothing at all (keeping the code as it is on + * HEAD just makes testing and review inconvenient). 
+ */ if (indexcol != iclause->indexcol) { /* Beginning of a new column's quals */ if (!eqQualHere) - break; /* done if no '=' qual for indexcol */ + { + found_skip = true; /* skip when no '=' qual for indexcol */ + if (index->pages < 100) + break; + num_sa_scans += 10; + } eqQualHere = false; indexcol++; if (indexcol != iclause->indexcol) - break; /* no quals at all for indexcol */ + { + /* no quals at all for indexcol */ + found_skip = true; + if (index->pages < 100) + break; + num_sa_scans += 10 * (iclause->indexcol - indexcol); + continue; + } } /* Examine each indexqual associated with this index clause */ @@ -6914,6 +6939,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, if (index->unique && indexcol == index->nkeycolumns - 1 && eqQualHere && + !found_skip && !found_saop && !found_is_null_op) numIndexTuples = 1.0; diff --git a/src/backend/utils/adt/skipsupport.c b/src/backend/utils/adt/skipsupport.c new file mode 100644 index 000000000..9665e4985 --- /dev/null +++ b/src/backend/utils/adt/skipsupport.c @@ -0,0 +1,54 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.c + * Support routines for B-Tree skip scans. + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/adt/skipsupport.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <limits.h> + +#include "access/nbtree.h" +#include "utils/lsyscache.h" +#include "utils/skipsupport.h" + +/* + * Fill in SkipSupport given an operator class (opfamily + opcintype). + * + * On success, returns true, and initializes all SkipSupport fields for + * caller. Otherwise returns false, indicating that operator class has no + * skip support function. 
+ */ +bool +PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, bool reverse, + SkipSupport sksup) +{ + Oid skipSupportFunction; + + /* Look for a skip support function */ + skipSupportFunction = get_opfamily_proc(opfamily, opcintype, opcintype, + BTSKIPSUPPORT_PROC); + if (!OidIsValid(skipSupportFunction)) + return false; + + OidFunctionCall1(skipSupportFunction, PointerGetDatum(sksup)); + + if (reverse) + { + Datum low_elem = sksup->low_elem; + + sksup->low_elem = sksup->high_elem; + sksup->high_elem = low_elem; + } + + return true; +} diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index 45eb1b2fe..a9222f896 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -13,12 +13,15 @@ #include "postgres.h" +#include <limits.h> + #include "common/hashfn.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "port/pg_bswap.h" #include "utils/fmgrprotos.h" #include "utils/guc.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #include "utils/timestamp.h" #include "utils/uuid.h" @@ -390,6 +393,68 @@ uuid_abbrev_convert(Datum original, SortSupport ssup) return res; } +static Datum +uuid_decrement(Relation rel, Datum existing) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] > 0) + { + uuid->data[i]--; + return UUIDPGetDatum(uuid); + } + uuid->data[i] = UCHAR_MAX; + } + + Assert(false); + + return UUIDPGetDatum(uuid); +} + +static Datum +uuid_increment(Relation rel, Datum existing) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] < UCHAR_MAX) + { + uuid->data[i]++; + return UUIDPGetDatum(uuid); + } + uuid->data[i] = 0; + } + + Assert(false); + + return UUIDPGetDatum(uuid); +} + +Datum +uuid_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport 
sksup = (SkipSupport) PG_GETARG_POINTER(0); + pg_uuid_t *uuid_min = palloc(UUID_LEN); + pg_uuid_t *uuid_max = palloc(UUID_LEN); + + memset(uuid_min->data, 0x00, UUID_LEN); + memset(uuid_max->data, 0xFF, UUID_LEN); + + sksup->decrement = uuid_decrement; + sksup->increment = uuid_increment; + sksup->low_elem = UUIDPGetDatum(uuid_min); + sksup->high_elem = UUIDPGetDatum(uuid_max); + + PG_RETURN_VOID(); +} + /* hash index support */ Datum uuid_hash(PG_FUNCTION_ARGS) diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 630ed0f16..6fc3ca1a7 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -28,6 +28,7 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/nbtree.h" #include "access/slru.h" #include "access/toast_compression.h" #include "access/twophase.h" @@ -1702,6 +1703,17 @@ struct config_bool ConfigureNamesBool[] = }, #endif + /* XXX Remove before commit */ + { + {"skipscan_skipsupport_enabled", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_skipsupport_enabled, + true, + NULL, NULL, NULL + }, + { {"integer_datetimes", PGC_INTERNAL, PRESET_OPTIONS, gettext_noop("Shows whether datetimes are integer based."), @@ -3525,6 +3537,17 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + /* XXX Remove before commit */ + { + {"skipscan_prefix_cols", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_prefix_cols, + INDEX_MAX_KEYS, 0, INDEX_MAX_KEYS, + NULL, NULL, NULL + }, + { /* Can't be set in postgresql.conf */ {"server_version_num", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml index 2b3997988..9662fb2ba 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -583,6 +583,19 @@ options(relopts local_relopts *) returns + + skipsupport + + + Optionally, a btree operator family may provide a skip + support function, registered under support 
function + number 6. These functions allow the B-tree code to more efficiently + navigate the index structure via an index skip scan. The + APIs involved in this are defined in + src/include/utils/skipsupport.h. + + + diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 22d8ad1aa..f17dd3456 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -461,6 +461,13 @@ 5 + + + Return the addresses of C-callable skip support function(s) + (optional) + + 6 + @@ -1056,7 +1063,8 @@ DEFAULT FOR TYPE int8 USING btree FAMILY integer_ops AS FUNCTION 1 btint8cmp(int8, int8) , FUNCTION 2 btint8sortsupport(internal) , FUNCTION 3 in_range(int8, int8, int8, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint8skipsupport(internal); CREATE OPERATOR CLASS int4_ops DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS @@ -1069,7 +1077,8 @@ DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS FUNCTION 1 btint4cmp(int4, int4) , FUNCTION 2 btint4sortsupport(internal) , FUNCTION 3 in_range(int4, int4, int4, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint4skipsupport(internal); CREATE OPERATOR CLASS int2_ops DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS @@ -1082,7 +1091,8 @@ DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS FUNCTION 1 btint2cmp(int2, int2) , FUNCTION 2 btint2sortsupport(internal) , FUNCTION 3 in_range(int2, int2, int2, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint2skipsupport(internal); ALTER OPERATOR FAMILY integer_ops USING btree ADD -- cross-type comparisons int8 vs int2 diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index ae54cb254..8b6b775c1 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -362,9 +362,9 @@ ERROR: invalid operator number 
0, must be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ERROR: operator argument types must be specified in ALTER OPERATOR FAMILY ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ERROR: invalid function number 0, must be between 1 and 5 -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 -ERROR: invalid function number 6, must be between 1 and 5 +ERROR: invalid function number 0, must be between 1 and 6 +ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 +ERROR: invalid function number 7, must be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index 3bbe4c5f9..a8d5be6c1 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -5138,9 +5138,10 @@ List of access methods btree | uuid_ops | uuid | uuid | 1 | uuid_cmp btree | uuid_ops | uuid | uuid | 2 | uuid_sortsupport btree | uuid_ops | uuid | uuid | 4 | btequalimage + btree | uuid_ops | uuid | uuid | 6 | uuid_skipsupport hash | uuid_ops | uuid | uuid | 1 | uuid_hash hash | uuid_ops | uuid | uuid | 2 | uuid_hash_extended -(5 rows) +(6 rows) -- check \dconfig set work_mem = 10240; diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql index de58d268d..4246afefd 100644 --- a/src/test/regress/sql/alter_generic.sql +++ b/src/test/regress/sql/alter_generic.sql @@ -310,7 +310,7 @@ ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 6 < (int4, int2); -- ope ALTER OPERATOR FAMILY 
alt_opf4 USING btree ADD OPERATOR 0 < (int4, int2); -- operator number should be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 +ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 635e6d6e2..58dec6a16 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -218,6 +218,7 @@ BTScanPos BTScanPosData BTScanPosItem BTShared +BTSkipPreproc BTSortArrayContext BTSpool BTStack @@ -2653,6 +2654,8 @@ SingleBoundSortItem SinglePartitionSpec Size SkipPages +SkipSupport +SkipSupportData SlabBlock SlabContext SlabSlot -- 2.45.2