From e4b75773628cd492edd12addeda8cbbac618b749 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 16 Apr 2024 13:21:36 -0400 Subject: [PATCH v5 3/3] Add skip scan to nbtree. Skip scan allows nbtree index scans to efficiently use a composite index on an index (a, b) for queries with a predicate such as "WHERE b = 5". This is useful in cases where the total number of distinct values in the column 'a' is reasonably small (think hundreds, possibly thousands). In effect, a skip scan treats the composite index on (a, b) as if it was a series of disjunct subindexes -- one subindex per distinct 'a' value. We exhaustively "search every subindex" using a qual that behaves like "WHERE a = ANY() AND b = 5". The design of skip scan works by extended the design for arrays established by commit 5bf748b8. "Skip arrays" generate their array values procedurally and on-demand, but otherwise work just like arrays used by SAOPs. B-Tree operator classes on discrete types can now optionally provide a skip support routine. This is used to generate the next array element value by incrementing the current value (or by decrementing, in the case of backwards scans). When the opclass lacks a skip support routine, we use sentinel next-key values instead. Adding skip support makes skip scans more efficient in cases where there is naturally a good chance that the very next value will find matching tuples. For example, during an index scan with a leading "sales_date" attribute, there is a decent chance that a scan that just finished returning tuples matching "sales_date = '2024-06-01' and id = 5000" will find later tuples matching "sales_date = '2024-06-02' and id = 5000". It is to our advantage to skip straight to the relevant "id = 5000" leaf page, totally avoiding reading earlier "sales_date = '2024-06-02'" leaf pages. Author: Peter Geoghegan Reviewed-By: Masahiro Ikeda Reviewed-By: Aleksander Alekseev Discussion: https://postgr.es/m/CAH2-Wzmn1YsLzOGgjAQZdn1STSG_y8qP__vggTaPAYXJP+G4bw@mail.gmail.com --- src/include/access/nbtree.h | 25 +- src/include/catalog/pg_amproc.dat | 16 + src/include/catalog/pg_proc.dat | 24 + src/include/utils/skipsupport.h | 109 ++ src/backend/access/nbtree/nbtcompare.c | 261 ++++ src/backend/access/nbtree/nbtsearch.c | 80 +- src/backend/access/nbtree/nbtutils.c | 1456 +++++++++++++++++-- src/backend/access/nbtree/nbtvalidate.c | 4 + src/backend/commands/opclasscmds.c | 25 + src/backend/utils/adt/Makefile | 1 + src/backend/utils/adt/date.c | 44 + src/backend/utils/adt/meson.build | 1 + src/backend/utils/adt/selfuncs.c | 30 +- src/backend/utils/adt/skipsupport.c | 60 + src/backend/utils/adt/uuid.c | 67 + src/backend/utils/misc/guc_tables.c | 23 + doc/src/sgml/btree.sgml | 13 + doc/src/sgml/indices.sgml | 40 +- doc/src/sgml/xindex.sgml | 16 +- src/test/regress/expected/alter_generic.out | 6 +- src/test/regress/expected/psql.out | 3 +- src/test/regress/sql/alter_generic.sql | 2 +- src/tools/pgindent/typedefs.list | 3 + 23 files changed, 2175 insertions(+), 134 deletions(-) create mode 100644 src/include/utils/skipsupport.h create mode 100644 src/backend/utils/adt/skipsupport.c diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 5f366323c..13a650c3d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -24,6 +24,7 @@ #include "lib/stringinfo.h" #include "storage/bufmgr.h" #include "storage/shm_toc.h" +#include "utils/skipsupport.h" /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; @@ -709,7 +710,8 @@ BTreeTupleGetMaxHeapTID(IndexTuple itup) #define BTINRANGE_PROC 3 #define BTEQUALIMAGE_PROC 4 #define BTOPTIONS_PROC 5 -#define BTNProcs 5 +#define BTSKIPSUPPORT_PROC 6 +#define BTNProcs 6 /* * We need to be able to tell the difference between read and write @@ -1031,10 +1033,22 @@ typedef BTScanPosData *BTScanPos; /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo { + /* fields used by both kinds of array (standard arrays and skip arrays) */ int scan_key; /* index of associated key in keyData */ + int num_elems; /* number of elems (-1 for skip array) */ + + /* fields for standard arrays that store elements in memory */ int cur_elem; /* index of current element in elem_values */ - int num_elems; /* number of elems in current array value */ Datum *elem_values; /* array of num_elems Datums */ + + /* fields for skip arrays, which generate their elements procedurally */ + bool use_sksup; /* sksup set to valid routine? */ + bool null_elem; /* lowest/highest element actually NULL? */ + SkipSupportData sksup; /* opclass skip scan support, when use_sksup */ + ScanKey low_compare; /* array's > or >= lower bound */ + ScanKey high_compare; /* array's < or <= upper bound */ + FmgrInfo order_low; /* low_compare's ORDER proc */ + FmgrInfo order_high; /* high_compare's ORDER proc */ } BTArrayKeyInfo; typedef struct BTScanOpaqueData @@ -1124,6 +1138,9 @@ typedef struct BTReadPageState */ #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_SKIP 0x00040000 /* skip array, for skip scan */ +#define SK_BT_NEGPOSINF 0x00080000 /* no sk_argument, -inf/+inf key */ +#define SK_BT_NEXTPRIOR 0x00100000 /* sk_argument is next/prior key */ #define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) @@ -1160,6 +1177,10 @@ typedef struct BTOptions #define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4 #define PROGRESS_BTREE_PHASE_LEAF_LOAD 5 +/* GUC parameters (just a temporary convenience for reviewers) */ +extern PGDLLIMPORT int skipscan_prefix_cols; +extern PGDLLIMPORT bool skipscan_skipsupport_enabled; + /* * external entry points for btree, in nbtree.c */ diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index f639c3a6a..2a8f6f3f1 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -21,6 +21,8 @@ amprocrighttype => 'bit', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '1', amproc => 'btboolcmp' }, +{ amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', + amprocrighttype => 'bool', amprocnum => '6', amproc => 'btboolskipsupport' }, { amprocfamily => 'btree/bool_ops', amproclefttype => 'bool', amprocrighttype => 'bool', amprocnum => '4', amproc => 'btequalimage' }, { amprocfamily => 'btree/bpchar_ops', amproclefttype => 'bpchar', @@ -41,12 +43,16 @@ amprocrighttype => 'char', amprocnum => '1', amproc => 'btcharcmp' }, { amprocfamily => 'btree/char_ops', amproclefttype => 'char', amprocrighttype => 'char', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/char_ops', amproclefttype => 'char', + amprocrighttype => 'char', amprocnum => '6', amproc => 'btcharskipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '1', amproc => 'date_cmp' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '2', amproc => 'date_sortsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', + amprocrighttype => 'date', amprocnum => '6', amproc => 'date_skipsupport' }, { amprocfamily => 'btree/datetime_ops', amproclefttype => 'date', amprocrighttype => 'timestamp', amprocnum => '1', amproc => 'date_cmp_timestamp' }, @@ -122,6 +128,8 @@ amprocrighttype => 'int2', amprocnum => '2', amproc => 'btint2sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', + amprocrighttype => 'int2', amprocnum => '6', amproc => 'btint2skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint24cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int2', @@ -141,6 +149,8 @@ amprocrighttype => 'int4', amprocnum => '2', amproc => 'btint4sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int4', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', + amprocrighttype => 'int4', amprocnum => '6', amproc => 'btint4skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', amprocrighttype => 'int8', amprocnum => '1', amproc => 'btint48cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int4', @@ -160,6 +170,8 @@ amprocrighttype => 'int8', amprocnum => '2', amproc => 'btint8sortsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int8', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', + amprocrighttype => 'int8', amprocnum => '6', amproc => 'btint8skipsupport' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint84cmp' }, { amprocfamily => 'btree/integer_ops', amproclefttype => 'int8', @@ -193,6 +205,8 @@ amprocrighttype => 'oid', amprocnum => '2', amproc => 'btoidsortsupport' }, { amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', amprocrighttype => 'oid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/oid_ops', amproclefttype => 'oid', + amprocrighttype => 'oid', amprocnum => '6', amproc => 'btoidskipsupport' }, { amprocfamily => 'btree/oidvector_ops', amproclefttype => 'oidvector', amprocrighttype => 'oidvector', amprocnum => '1', amproc => 'btoidvectorcmp' }, @@ -261,6 +275,8 @@ amprocrighttype => 'uuid', amprocnum => '2', amproc => 'uuid_sortsupport' }, { amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', amprocrighttype => 'uuid', amprocnum => '4', amproc => 'btequalimage' }, +{ amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid', + amprocrighttype => 'uuid', amprocnum => '6', amproc => 'uuid_skipsupport' }, { amprocfamily => 'btree/record_ops', amproclefttype => 'record', amprocrighttype => 'record', amprocnum => '1', amproc => 'btrecordcmp' }, { amprocfamily => 'btree/record_image_ops', amproclefttype => 'record', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index d36f6001b..2dec83363 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1004,18 +1004,27 @@ { oid => '3129', descr => 'sort support', proname => 'btint2sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint2sortsupport' }, +{ oid => '9290', descr => 'skip support', + proname => 'btint2skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint2skipsupport' }, { oid => '351', descr => 'less-equal-greater', proname => 'btint4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int4 int4', prosrc => 'btint4cmp' }, { oid => '3130', descr => 'sort support', proname => 'btint4sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint4sortsupport' }, +{ oid => '9291', descr => 'skip support', + proname => 'btint4skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint4skipsupport' }, { oid => '842', descr => 'less-equal-greater', proname => 'btint8cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'int8 int8', prosrc => 'btint8cmp' }, { oid => '3131', descr => 'sort support', proname => 'btint8sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint8sortsupport' }, +{ oid => '9292', descr => 'skip support', + proname => 'btint8skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btint8skipsupport' }, { oid => '354', descr => 'less-equal-greater', proname => 'btfloat4cmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'float4 float4', prosrc => 'btfloat4cmp' }, @@ -1034,12 +1043,18 @@ { oid => '3134', descr => 'sort support', proname => 'btoidsortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btoidsortsupport' }, +{ oid => '9293', descr => 'skip support', + proname => 'btoidskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btoidskipsupport' }, { oid => '404', descr => 'less-equal-greater', proname => 'btoidvectorcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'oidvector oidvector', prosrc => 'btoidvectorcmp' }, { oid => '358', descr => 'less-equal-greater', proname => 'btcharcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'char char', prosrc => 'btcharcmp' }, +{ oid => '9294', descr => 'skip support', + proname => 'btcharskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btcharskipsupport' }, { oid => '359', descr => 'less-equal-greater', proname => 'btnamecmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'name name', prosrc => 'btnamecmp' }, @@ -2214,6 +2229,9 @@ { oid => '3136', descr => 'sort support', proname => 'date_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'date_sortsupport' }, +{ oid => '9295', descr => 'skip support', + proname => 'date_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'date_skipsupport' }, { oid => '4133', descr => 'window RANGE support', proname => 'in_range', prorettype => 'bool', proargtypes => 'date date interval bool bool', @@ -4401,6 +4419,9 @@ { oid => '1693', descr => 'less-equal-greater', proname => 'btboolcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'bool bool', prosrc => 'btboolcmp' }, +{ oid => '9296', descr => 'skip support', + proname => 'btboolskipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'btboolskipsupport' }, { oid => '1688', descr => 'hash', proname => 'time_hash', prorettype => 'int4', proargtypes => 'time', @@ -9231,6 +9252,9 @@ { oid => '3300', descr => 'sort support', proname => 'uuid_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'uuid_sortsupport' }, +{ oid => '9297', descr => 'skip support', + proname => 'uuid_skipsupport', prorettype => 'void', + proargtypes => 'internal', prosrc => 'uuid_skipsupport' }, { oid => '2961', descr => 'I/O', proname => 'uuid_recv', prorettype => 'uuid', proargtypes => 'internal', prosrc => 'uuid_recv' }, diff --git a/src/include/utils/skipsupport.h b/src/include/utils/skipsupport.h new file mode 100644 index 000000000..d91390fc6 --- /dev/null +++ b/src/include/utils/skipsupport.h @@ -0,0 +1,109 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.h + * Support routines for B-Tree skip scan. + * + * B-Tree operator classes for discrete types can optionally provide a support + * function for skipping. This is used during skip scans. + * + * A B-tree operator class that implements skip support provides B-tree index + * scans with a way of enumerating and iterating through every possible value + * from the domain of indexable values. This gives scans a way to determine + * the next value in line for a given skip array/scan key/skipped attribute. + * This happens at the point where the scan determines that another primitive + * index scan is required. The next value is used (in combination with at + * least one additional lower-order non-skip key, taken from the SQL query) to + * relocate the scan, skipping over many irrelevant leaf pages in the process. + * + * Skip support generally works best with discrete types such as integer, + * date, and boolean; types where there is a decent chance that indexes will + * contain contiguous values (given a leading attributes using the opclass). + * When gaps/discontinuities are naturally rare (e.g., a leading identity + * column in a composite index, a date column preceding a product_id column), + * then it makes sense for skip scans to optimistically assume that the next + * distinct indexable value will find directly matching index tuples. + * + * The B-Tree code can fall back on next-key sentinel values for any opclass + * that doesn't provide its own skip support function. There is no point in + * providing skip support unless the next indexed key value is often the next + * indexable value (at least with some workloads). Opclasses where that never + * works out in practice should just rely on the B-Tree AM's generic next-key + * fallback strategy. Opclasses where adding skip support is infeasible or + * hard (e.g., an opclass for a continuous type) can also use the fallback. + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/skipsupport.h + * + *------------------------------------------------------------------------- + */ +#ifndef SKIPSUPPORT_H +#define SKIPSUPPORT_H + +#include "utils/relcache.h" + +typedef struct SkipSupportData *SkipSupport; +typedef Datum (*SkipSupportIncDec) (Relation rel, + Datum existing, + bool *overflow); + +/* + * State/callbacks used by skip arrays to procedurally generate elements. + * + * A BTSKIPSUPPORT_PROC function must set each and every field when called. + * If an opclass can only set some of the fields, then it cannot safely + * provide a skip support routine. + */ +typedef struct SkipSupportData +{ + /* + * low_elem and high_elem must be set with the lowest and highest possible + * values from the domain of indexable values (assuming standard ascending + * order). This helps the B-Tree code with finding its initial position + * at the leaf level (during the skip scan's first primitive index scan). + * In other words, it gives the B-Tree code a useful value to start from, + * before any data has been read from the index. + * + * low_elem and high_elem are also used by skip scans to determine when + * they've reached the final possible value (in the current direction). + * It's typical for the scan to run out of leaf pages before it runs out + * of unscanned indexable values, but it's still useful for the scan to + * have a way to recognize when it has reached the last possible value + * (this saves us a useless probe that just lands on the final leaf page). + */ + Datum low_elem; /* lowest sorting/leftmost non-NULL value */ + Datum high_elem; /* highest sorting/rightmost non-NULL value */ + + /* + * Decrement/increment functions. + * + * Returns a decremented/incremented copy of caller's existing datum, + * allocated in caller's memory context (in the case of pass-by-reference + * types). It's not okay for these functions to leak any memory. + * + * Both decrement and increment callbacks are guaranteed to never be + * called with a NULL "existing" arg. + * + * When the decrement function (or increment function) is called with a + * value that already matches low_elem (or high_elem), function must set + * the *overflow argument. The return value is undefined, and the B-Tree + * code is entitled to assume that no memory will have been allocated. + * + * The B-Tree skip scan caller's "existing" datum is often just a straight + * copy of a value from an index tuple. Operator classes must be liberal + * in accepting every possible representational variation within the + * underlying data type. On the other hand, opclasses are _not_ expected + * to preserve any information that doesn't affect how datums are sorted + * (e.g., skip support for a fixed precision numeric type isn't required + * to preserve datum display scale). + */ + SkipSupportIncDec decrement; + SkipSupportIncDec increment; +} SkipSupportData; + +extern bool PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, + bool reverse, SkipSupport sksup); + +#endif /* SKIPSUPPORT_H */ diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index 1c72867c8..deb387453 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -58,6 +58,7 @@ #include #include "utils/fmgrprotos.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #ifdef STRESS_SORT_INT_MIN @@ -78,6 +79,49 @@ btboolcmp(PG_FUNCTION_ARGS) PG_RETURN_INT32((int32) a - (int32) b); } +static Datum +bool_decrement(Relation rel, Datum existing, bool *underflow) +{ + bool bexisting = DatumGetBool(existing); + + if (bexisting == false) + { + *underflow = true; + return 0; + } + + *underflow = false; + return BoolGetDatum(bexisting - 1); +} + +static Datum +bool_increment(Relation rel, Datum existing, bool *overflow) +{ + bool bexisting = DatumGetBool(existing); + + if (bexisting == true) + { + *overflow = true; + return 0; + } + + *overflow = false; + return BoolGetDatum(bexisting + 1); +} + +Datum +btboolskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = bool_decrement; + sksup->increment = bool_increment; + sksup->low_elem = BoolGetDatum(false); + sksup->high_elem = BoolGetDatum(true); + + PG_RETURN_VOID(); +} + Datum btint2cmp(PG_FUNCTION_ARGS) { @@ -105,6 +149,49 @@ btint2sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int2_decrement(Relation rel, Datum existing, bool *underflow) +{ + int16 iexisting = DatumGetInt16(existing); + + if (iexisting == PG_INT16_MIN) + { + *underflow = true; + return 0; + } + + *underflow = false; + return Int16GetDatum(iexisting - 1); +} + +static Datum +int2_increment(Relation rel, Datum existing, bool *overflow) +{ + int16 iexisting = DatumGetInt16(existing); + + if (iexisting == PG_INT16_MAX) + { + *overflow = true; + return 0; + } + + *overflow = false; + return Int16GetDatum(iexisting + 1); +} + +Datum +btint2skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int2_decrement; + sksup->increment = int2_increment; + sksup->low_elem = Int16GetDatum(PG_INT16_MIN); + sksup->high_elem = Int16GetDatum(PG_INT16_MAX); + + PG_RETURN_VOID(); +} + Datum btint4cmp(PG_FUNCTION_ARGS) { @@ -128,6 +215,49 @@ btint4sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int4_decrement(Relation rel, Datum existing, bool *underflow) +{ + int32 iexisting = DatumGetInt32(existing); + + if (iexisting == PG_INT32_MIN) + { + *underflow = true; + return 0; + } + + *underflow = false; + return Int32GetDatum(iexisting - 1); +} + +static Datum +int4_increment(Relation rel, Datum existing, bool *overflow) +{ + int32 iexisting = DatumGetInt32(existing); + + if (iexisting == PG_INT32_MAX) + { + *overflow = true; + return 0; + } + + *overflow = false; + return Int32GetDatum(iexisting + 1); +} + +Datum +btint4skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int4_decrement; + sksup->increment = int4_increment; + sksup->low_elem = Int32GetDatum(PG_INT32_MIN); + sksup->high_elem = Int32GetDatum(PG_INT32_MAX); + + PG_RETURN_VOID(); +} + Datum btint8cmp(PG_FUNCTION_ARGS) { @@ -171,6 +301,49 @@ btint8sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +int8_decrement(Relation rel, Datum existing, bool *underflow) +{ + int64 iexisting = DatumGetInt64(existing); + + if (iexisting == PG_INT64_MIN) + { + *underflow = true; + return 0; + } + + *underflow = false; + return Int64GetDatum(iexisting - 1); +} + +static Datum +int8_increment(Relation rel, Datum existing, bool *overflow) +{ + int64 iexisting = DatumGetInt64(existing); + + if (iexisting == PG_INT64_MAX) + { + *overflow = true; + return 0; + } + + *overflow = false; + return Int64GetDatum(iexisting + 1); +} + +Datum +btint8skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = int8_decrement; + sksup->increment = int8_increment; + sksup->low_elem = Int64GetDatum(PG_INT64_MIN); + sksup->high_elem = Int64GetDatum(PG_INT64_MAX); + + PG_RETURN_VOID(); +} + Datum btint48cmp(PG_FUNCTION_ARGS) { @@ -292,6 +465,49 @@ btoidsortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +oid_decrement(Relation rel, Datum existing, bool *underflow) +{ + Oid oexisting = DatumGetObjectId(existing); + + if (oexisting == InvalidOid) + { + *underflow = true; + return 0; + } + + *underflow = false; + return ObjectIdGetDatum(oexisting - 1); +} + +static Datum +oid_increment(Relation rel, Datum existing, bool *overflow) +{ + Oid oexisting = DatumGetObjectId(existing); + + if (oexisting == OID_MAX) + { + *overflow = true; + return 0; + } + + *overflow = false; + return ObjectIdGetDatum(oexisting + 1); +} + +Datum +btoidskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = oid_decrement; + sksup->increment = oid_increment; + sksup->low_elem = ObjectIdGetDatum(InvalidOid); + sksup->high_elem = ObjectIdGetDatum(OID_MAX); + + PG_RETURN_VOID(); +} + Datum btoidvectorcmp(PG_FUNCTION_ARGS) { @@ -325,3 +541,48 @@ btcharcmp(PG_FUNCTION_ARGS) /* Be careful to compare chars as unsigned */ PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b)); } + +static Datum +char_decrement(Relation rel, Datum existing, bool *underflow) +{ + uint8 cexisting = UInt8GetDatum(existing); + + if (cexisting == 0) + { + *underflow = true; + return 0; + } + + *underflow = false; + return CharGetDatum((uint8) cexisting - 1); +} + +static Datum +char_increment(Relation rel, Datum existing, bool *overflow) +{ + uint8 cexisting = UInt8GetDatum(existing); + + if (cexisting == UCHAR_MAX) + { + *overflow = true; + return 0; + } + + *overflow = false; + return CharGetDatum((uint8) cexisting + 1); +} + +Datum +btcharskipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = char_decrement; + sksup->increment = char_increment; + + /* btcharcmp compares chars as unsigned */ + sksup->low_elem = UInt8GetDatum(0); + sksup->high_elem = UInt8GetDatum(UCHAR_MAX); + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 88f4ef7b7..05b98efc2 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -880,7 +880,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Buffer buf; BTStack stack; OffsetNumber offnum; - StrategyNumber strat; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; @@ -1022,6 +1021,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanKey chosen; ScanKey impliesNN; ScanKey cur; + int ikey = 0, + ichosen = 0; /* * chosen is the so-far-chosen key for the current attribute, if any. @@ -1042,6 +1043,53 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { if (i >= so->numberOfKeys || cur->sk_attno != curattr) { + /* + * Conceptually, skip arrays consist of array elements whose + * values are generated procedurally and on demand. We need + * special handling for that here. + * + * We must interpret various sentinel values to generate an + * insertion scan key. This is only actually needed for index + * attributes whose input opclass lacks a skip support routine + * (when skip support is available we'll always be able to + * generate true array element datum values instead). + */ + if (chosen && (chosen->sk_flags & SK_BT_NEGPOSINF)) + { + ScanKey origchosen = chosen; + BTArrayKeyInfo *array = NULL; + + for (; ikey < so->numArrayKeys; ikey++) + { + array = &so->arrayKeys[ikey]; + if (array->scan_key == ichosen) + break; + } + + /* use array's inequality key in startKeys[] */ + if (ScanDirectionIsForward(dir)) + chosen = array->low_compare; + else + chosen = array->high_compare; + + if (!chosen && !array->null_elem) + { + /* + * Array doesn't have any explicit low_compare or + * high_compare that we can use (given the current + * scan direction). The array does not include a NULL + * element (to generate an IS NULL qual), though, so + * we might need to deduce a NOT NULL key to skip over + * any NULLs. Prepare for that. + * + * Note: this is also how we handle an explicit NOT + * NULL key that preprocessing folded into the skip + * array. + */ + impliesNN = origchosen; + } + } + /* * Done looking at keys for curattr. If we didn't find a * usable boundary key, see if we can deduce a NOT NULL key. @@ -1075,16 +1123,34 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) break; startKeys[keysz++] = chosen; + /* + * Skip arrays can also use a sk_argument which is marked + * "next key". This is another sentinel array element value + * requiring special handling here by us. As with -inf/+inf + * sentinels, there cannot be any exact non-pivot matches. + */ + if (chosen->sk_flags & SK_BT_NEXTPRIOR) + { + /* + * Adjust strat_total, so that our = key gets treated like + * a > key (or like a < key) + */ + if (ScanDirectionIsForward(dir)) + strat_total = BTGreaterStrategyNumber; + else + strat_total = BTLessStrategyNumber; + break; + } + /* * Adjust strat_total, and quit if we have stored a > or < * key. */ - strat = chosen->sk_strategy; - if (strat != BTEqualStrategyNumber) + if (chosen->sk_strategy != BTEqualStrategyNumber) { - strat_total = strat; - if (strat == BTGreaterStrategyNumber || - strat == BTLessStrategyNumber) + strat_total = chosen->sk_strategy; + if (chosen->sk_strategy == BTGreaterStrategyNumber || + chosen->sk_strategy == BTLessStrategyNumber) break; } @@ -1103,6 +1169,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) curattr = cur->sk_attno; chosen = NULL; impliesNN = NULL; + ichosen = -1; } /* @@ -1127,6 +1194,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) case BTEqualStrategyNumber: /* override any non-equality choice */ chosen = cur; + ichosen = i; break; case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 7fa977a62..fa046c550 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -29,9 +29,37 @@ #include "utils/memutils.h" #include "utils/rel.h" +/* + * GUC parameters (temporary convenience for reviewers). + * + * To disable all skipping, set skipscan_prefix_cols=0. Otherwise set it to + * the attribute number that you wish to make the last attribute number that + * we can add a skip scan key for. For example, skipscan_prefix_cols=1 makes + * an index scan with qual "WHERE b = 1 AND c > 42" generate a skip scan key + * on the column 'a' (which is attnum 1) only, preventing us from adding one + * for the column 'c' (and so 'c' will still have an inequality scan key, + * required in only one direction -- 'c' won't be output as a "range" skip + * key/array). + */ +int skipscan_prefix_cols = INDEX_MAX_KEYS; + +/* + * skipscan_skipsupport_enabled can be used to avoid using opclass skip + * support routines. This can be used to quantify the peformance benefit that + * comes from having dedicated skip support, with a given test query. + */ +bool skipscan_skipsupport_enabled = true; + #define LOOK_AHEAD_REQUIRED_RECHECKS 3 #define LOOK_AHEAD_DEFAULT_DISTANCE 5 +typedef struct BTSkipPreproc +{ + SkipSupportData sksup; /* opclass skip scan support (optional) */ + bool use_sksup; /* sksup set to valid routine? */ + Oid eq_op; /* InvalidOid means don't skip */ +} BTSkipPreproc; + typedef struct BTSortArrayContext { FmgrInfo *sortproc; @@ -64,15 +92,38 @@ static bool _bt_compare_array_scankey_args(IndexScanDesc scan, bool *qual_ok); static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys); static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); +static int _bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts); +static bool _bt_skipsupport(Relation rel, int add_skip_attno, + BTSkipPreproc *skipatts); static int _bt_compare_array_elements(const void *a, const void *b, void *arg); static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur); + Datum arrdatum, bool arrnull, + ScanKey cur); +static void _bt_array_preproc_shrink(ScanKey arraysk, ScanKey skey, + FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok); +static bool _bt_skip_preproc_shrink(IndexScanDesc scan, ScanKey arraysk, + ScanKey skey, FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok); static int _bt_binsrch_array_skey(FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result); +static void _bt_binsrch_skiparray_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static void _bt_scankey_set_low_or_high(Relation rel, ScanKey skey, + BTArrayKeyInfo *array, bool low_not_high); +static void _bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + Datum tupdatum, bool tupnull); +static void _bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static void _bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static bool _bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); +static bool _bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, @@ -258,11 +309,19 @@ _bt_freestack(BTStack stack) * preprocessing steps are complete. This will convert the scan key offset * references into references to the scan's so->keyData[] output scan keys. * + * We're also responsible for generating skip arrays (and their associated + * scan keys) here. This enables skip scan. We do this for index attributes + * that initially lacked an equality condition within scan->keyData[], iff + * doing so allows a later scan key (that was passed to us in scan->keyData[]) + * to be marked required by later preprocessing on output. + * _bt_decide_skipatts decides which attributes receive skip arrays. + * * Caller must pass *numberOfKeys to give us a way to change the number of * input scan keys (our output is caller's input). The returned array can be * smaller than scan->keyData[] when we eliminated a redundant array scan key - * (redundant with some other array scan key, for the same attribute). Caller - * uses this to allocate so->keyData[] for the current btrescan. + * (redundant with some other array scan key, for the same attribute). It can + * also be larger when we added a skip array/skip scan key. Caller uses this + * to allocate so->keyData[] for the current btrescan. * * Note: the reason we need to return a temp scan key array, rather than just * scribbling on scan->keyData, is that callers are permitted to call btrescan @@ -275,8 +334,11 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) Relation rel = scan->indexRelation; int numArrayKeyData = scan->numberOfKeys; int16 *indoption = rel->rd_indoption; + BTSkipPreproc skipatts[INDEX_MAX_KEYS]; int numArrayKeys, + numSkipArrayKeys, output_ikey = 0; + AttrNumber attno_skip = 1; int origarrayatt = InvalidAttrNumber, origarraykey = -1; Oid origelemtype = InvalidOid; @@ -286,7 +348,10 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) Assert(scan->numberOfKeys); - /* Quick check to see if there are any array keys */ + /* + * Quick check to see if there are any array keys, or any missing keys we + * can generate a "skip scan" array key for ourselves + */ numArrayKeys = 0; for (int i = 0; i < scan->numberOfKeys; i++) { @@ -304,6 +369,16 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) } } + /* Consider generating skip arrays, and associated equality scan keys */ + numSkipArrayKeys = _bt_decide_skipatts(scan, skipatts); + if (numSkipArrayKeys) + { + /* At least one skip array scan key must be added to arrayKeyData[] */ + numArrayKeys += numSkipArrayKeys; + /* output scan key buffer allocation needs space for skip scan keys */ + numArrayKeyData += numSkipArrayKeys; + } + /* Quit if nothing to do. */ if (numArrayKeys == 0) return NULL; @@ -330,7 +405,12 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) /* Allocate space for ORDER procs used to help _bt_checkkeys */ so->orderProcs = (FmgrInfo *) palloc(numArrayKeyData * sizeof(FmgrInfo)); - /* Now process each array key */ + /* + * Process each array key, and generate skip arrays as needed. Also copy + * every scan->keyData[] input scan key (whether it's an array or not) + * into the arrayKeyData array we'll return to our caller (barring any + * array scan keys that we could eliminate early through array merging). + */ numArrayKeys = 0; for (int input_ikey = 0; input_ikey < scan->numberOfKeys; input_ikey++) { @@ -348,6 +428,73 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) int num_nonnulls; int j; + /* Create a skip array and scan key where indicated by skipatts */ + while (numSkipArrayKeys && + attno_skip <= scan->keyData[input_ikey].sk_attno) + { + Oid opcintype = rel->rd_opcintype[attno_skip - 1]; + Oid collation = rel->rd_indcollation[attno_skip - 1]; + Oid eq_op = skipatts[attno_skip - 1].eq_op; + RegProcedure cmp_proc; + + if (!OidIsValid(eq_op)) + { + /* won't skip using this attribute */ + attno_skip++; + continue; + } + + cmp_proc = get_opcode(eq_op); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing oprcode for skipping equals operator %u", eq_op); + + cur = &arrayKeyData[output_ikey]; + Assert(attno_skip <= scan->keyData[input_ikey].sk_attno); + ScanKeyEntryInitialize(cur, + SK_SEARCHARRAY | SK_BT_SKIP, /* flags */ + attno_skip, /* skipped att number */ + BTEqualStrategyNumber, /* equality strategy */ + InvalidOid, /* opclass input subtype */ + collation, /* index column's collation */ + cmp_proc, /* equality operator's proc */ + (Datum) 0); /* constant */ + + /* Initialize array fields */ + so->arrayKeys[numArrayKeys].scan_key = output_ikey; + so->arrayKeys[numArrayKeys].num_elems = -1; + so->arrayKeys[numArrayKeys].cur_elem = 0; + so->arrayKeys[numArrayKeys].elem_values = NULL; /* unusued */ + so->arrayKeys[numArrayKeys].use_sksup = skipatts[attno_skip - 1].use_sksup; + so->arrayKeys[numArrayKeys].null_elem = true; /* for now */ + so->arrayKeys[numArrayKeys].sksup = skipatts[attno_skip - 1].sksup; + so->arrayKeys[numArrayKeys].low_compare = NULL; /* for now */ + so->arrayKeys[numArrayKeys].high_compare = NULL; /* for now */ + + /* + * Temporary testing GUC can disable the use of an opclass's skip + * support routine + */ + if (!skipscan_skipsupport_enabled) + so->arrayKeys[numArrayKeys].use_sksup = false; + + /* + * We'll need a 3-way ORDER proc to determine when and how the + * consed-up "array" will advance inside _bt_advance_array_keys. + * Set one up now. + */ + _bt_setup_array_cmp(scan, cur, opcintype, + &so->orderProcs[output_ikey], NULL); + + /* + * Prepare to output next scan key (might be another skip scan + * key, or it could be an input scan key from scan->keyData[]) + */ + numSkipArrayKeys--; + numArrayKeys++; + attno_skip++; + output_ikey++; /* keep this scan key/array */ + } + /* * Copy input scan key into temp arrayKeyData scan key array. (From * here on, cur points at our copy of the input scan key.) @@ -522,6 +669,10 @@ _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys) so->arrayKeys[numArrayKeys].scan_key = output_ikey; so->arrayKeys[numArrayKeys].num_elems = num_elems; so->arrayKeys[numArrayKeys].elem_values = elem_values; + so->arrayKeys[numArrayKeys].null_elem = false; /* unused */ + so->arrayKeys[numArrayKeys].use_sksup = false; /* redundant */ + so->arrayKeys[numArrayKeys].low_compare = NULL; /* unused */ + so->arrayKeys[numArrayKeys].high_compare = NULL; /* unused */ numArrayKeys++; output_ikey++; /* keep this scan key/array */ } @@ -635,7 +786,8 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) { BTArrayKeyInfo *array = &so->arrayKeys[arrayidx]; - Assert(array->num_elems > 0); + Assert(array->num_elems > 0 || array->num_elems == -1); + Assert(array->num_elems != -1 || outkey->sk_flags & SK_BT_REQFWD); if (array->scan_key == input_ikey) { @@ -696,6 +848,211 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) so->numArrayKeys, INDEX_MAX_KEYS))); } +/* + * _bt_decide_skipatts() -- set index attributes requiring skip arrays + * + * _bt_preprocess_array_keys helper function. Determines which attributes + * will require skip arrays/scan keys. Also sets up skip support callbacks + * for attributes whose input opclass have skip support (opclasses without + * skip support will fall back on using next-key sentinel values when + * advancing the skip array to its next array element). + * + * Return value is the total number of scan keys to add as "input" scan keys + * for further processing within _bt_preprocess_keys. + */ +static int +_bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts) +{ + Relation rel = scan->indexRelation; + ScanKey inputsk; + AttrNumber attno_inputsk = 1, + attno_skip = 1; + bool attno_has_equal = false, + attno_has_rowcompare = false; + int numSkipArrayKeys = 0, + prev_numSkipArrayKeys = 0; + + Assert(scan->numberOfKeys); + + /* + * FIXME Don't support parallel index scans for now. + * + * _bt_parallel_primscan_schedule must be taught to account for skip + * arrays. This is likely to require that we store the current array + * element datum in shared memory. + */ + if (scan->parallel_scan) + return 0; + + /* + * Only add skip arrays (and associated scan keys) when doing so will + * enable _bt_preprocess_keys to mark one or more lower-order input scan + * keys (user-visible scan keys taken from scan->keyData[] input array) as + * required to continue the scan. + */ + inputsk = &scan->keyData[0]; + for (int i = 0;; inputsk++, i++) + { + /* + * Backfill skip arrays for any wholly omitted attributes prior to + * attno_inputsk + */ + while (attno_skip < attno_inputsk) + { + if (!_bt_skipsupport(rel, attno_skip, &skipatts[attno_skip - 1])) + { + /* + * Opclass lacks a suitable skip support routine. + * + * Return prev_numSkipArrayKeys, so as to avoid including any + * "backfilled" arrays that were supposed to form a contiguous + * group with a skip array on this attribute. There is no + * benefit to adding backfill skip arrays unless we can do so + * for all attributes (all attributes up to and including the + * one immediately before attno_inputsk). + */ + return prev_numSkipArrayKeys; + } + + /* plan on adding a backfill skip array for this attribute */ + numSkipArrayKeys++; + attno_skip++; + } + + /* + * Stop once past the final input scan key. We deliberately never add + * a skip attribute for the attribute of the last input scan key. + * + * If the last input scan key(s) use equality strategy, then a skip + * attribute is superfluous at best. If the last input scan key uses + * an inequality strategy, then adding a skip scan array/scan key is a + * valid though suboptimal transformation. It is better to arrange + * for preprocessing to allow such an input inequality scan key to + * remain an inequality on output. That way _bt_checkkeys will be + * able to make best use of both of its precheck optimizations, but + * _bt_first will be no less capable of efficiently finding the + * starting position for each primitive index scan. + */ + if (i >= scan->numberOfKeys) + break; + + /* + * Cannot keep adding skip arrays after a RowCompare + */ + if (attno_has_rowcompare) + break; + + /* + * Apply temporary testing GUC that can be used to disable skipping + * (either in part or in whole) + */ + if (attno_inputsk > skipscan_prefix_cols) + break; + + /* + * Now consider next attno_inputsk (or keep going if this is an + * additional scan key against the same attribute) + */ + if (attno_inputsk < inputsk->sk_attno) + { + prev_numSkipArrayKeys = numSkipArrayKeys; + + /* + * Now add skip array for previous scan key's attribute, though + * only if the attribute has no equality strategy scan keys. + * + * Adding skip arrays to an attribute that has one or more + * inequality scan keys will cause preprocessing to output a range + * skip array. This will happen when preprocessing proper deals + * with the redundancy between the array and its inequalities. + */ + skipatts[attno_skip - 1].eq_op = InvalidOid; + if (!attno_has_equal) + { + /* Only saw inequalities for the prior attribute */ + if (_bt_skipsupport(rel, attno_skip, &skipatts[attno_skip - 1])) + { + /* add a range skip array for this attribute */ + numSkipArrayKeys++; + } + else + break; + } + else + { + /* + * Saw an equality for the prior attribute, so it doesn't need + * a skip array (not even a range skip array) + */ + } + + /* Set things up for this new attribute */ + attno_skip++; + attno_inputsk = inputsk->sk_attno; + attno_has_equal = false; + } + + /* + * Track if this scan key's attribute has any equality strategy scan + * keys. + * + * Treat IS NULL scan keys as using equal strategy (they'll be marked + * as using it later on, by _bt_fix_scankey_strategy). + */ + if (inputsk->sk_strategy == BTEqualStrategyNumber || + (inputsk->sk_flags & SK_SEARCHNULL)) + attno_has_equal = true; + + /* + * We don't support RowCompare transformation. Remember that we saw a + * RowCompare, so that we don't keep adding skip attributes. + * + * We do still backfill skip attributes before the RowCompare, so that + * it can be marked required. This is similar to what happens when a + * conventional inequality uses an opclass that lacks skip support. + */ + if (inputsk->sk_flags & SK_ROW_HEADER) + attno_has_rowcompare = true; + } + + return numSkipArrayKeys; +} + +/* + * _bt_skipsupport() -- set up skip support function in *skipatts + * + * Returns true on success, indicating that we set *skipatts with input + * opclass's equality operator. Otherwise returns false. + */ +static bool +_bt_skipsupport(Relation rel, int add_skip_attno, BTSkipPreproc *skipatts) +{ + int16 *indoption = rel->rd_indoption; + Oid opfamily = rel->rd_opfamily[add_skip_attno - 1]; + Oid opcintype = rel->rd_opcintype[add_skip_attno - 1]; + bool reverse; + + /* Look up input opclass's equality operator (might fail) */ + skipatts->eq_op = get_opfamily_member(opfamily, opcintype, opcintype, + BTEqualStrategyNumber); + + /* + * We don't really expect input opclasses lacking even an equality + * operator, but they're still supported. Deal with them gracefully. + */ + if (!OidIsValid(skipatts->eq_op)) + return false; + + /* Have skip support infrastructure set all SkipSupport fields */ + reverse = (indoption[add_skip_attno - 1] & INDOPTION_DESC) != 0; + skipatts->use_sksup = PrepareSkipSupportFromOpclass(opfamily, opcintype, + reverse, + &skipatts->sksup); + + /* might not have set up skip support routine, but can skip either way */ + return true; +} + /* * _bt_setup_array_cmp() -- Set up array comparison functions * @@ -988,17 +1345,15 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey FmgrInfo *orderproc, BTArrayKeyInfo *array, bool *qual_ok) { + BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1]; - int cmpresult = 0, - cmpexact = 0, - matchelem, - new_nelems = 0; FmgrInfo crosstypeproc; FmgrInfo *orderprocp = orderproc; + MemoryContext oldContext; + bool eliminated; Assert(arraysk->sk_attno == skey->sk_attno); - Assert(array->num_elems > 0); Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); Assert((arraysk->sk_flags & SK_SEARCHARRAY) && arraysk->sk_strategy == BTEqualStrategyNumber); @@ -1011,8 +1366,8 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey * datum of opclass input type for the index's attribute (on-disk type). * We can reuse the array's ORDER proc whenever the non-array scan key's * type is a match for the corresponding attribute's input opclass type. - * Otherwise, we have to do another ORDER proc lookup so that our call to - * _bt_binsrch_array_skey applies the correct comparator. + * Otherwise, we have to do another ORDER proc lookup. We have to be sure + * that _bt_compare_array_skey/_bt_binsrch_array_skey use the right proc. * * Note: we have to support the convention that sk_subtype == InvalidOid * means the opclass input type; this is a hack to simplify life for @@ -1043,11 +1398,65 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey return false; } - /* We have all we need to determine redundancy/contradictoriness */ + /* We successfully looked up the required cross-type ORDER proc */ orderprocp = &crosstypeproc; fmgr_info(cmp_proc, orderprocp); } + oldContext = MemoryContextSwitchTo(so->arrayContext); + + /* + * Perform preprocessing of the array based on whether it's a conventional + * array, or a skip array. Sets *qual_ok correctly in passing. + */ + if (array->num_elems != -1) + { + _bt_array_preproc_shrink(arraysk, skey, orderprocp, array, qual_ok); + + /* + * We successfully looked up the required cross-type ORDER proc, which + * ensured that the scalar scan key could be eliminated as redundant + */ + eliminated = true; + } + else + { + /* + * With a skip array it's possible that we won't be able to eliminate + * the scalar scan key, despite looking up the required ORDER proc. + * This happens when earlier preprocessing wasn't able to eliminate a + * redundant scan key inequality due to a lack of cross-type support. + */ + eliminated = _bt_skip_preproc_shrink(scan, arraysk, skey, orderprocp, + array, qual_ok); + } + + MemoryContextSwitchTo(oldContext); + + return eliminated; +} + +/* + * Finish off preprocessing of conventional (non-skip) array scan key when it + * is redundant with (or contradicted by) a non-array scalar scan key. + * _bt_compare_array_scankey_args helper function, called after the relevant + * (potentially cross-type) ORDER proc has been looked up successfully. + * + * Rewrites caller's array in-place as needed to eliminate redundant array + * elements. Calling here always renders caller's scalar scan key redundant. + */ +static void +_bt_array_preproc_shrink(ScanKey arraysk, ScanKey skey, FmgrInfo *orderprocp, + BTArrayKeyInfo *array, bool *qual_ok) +{ + int cmpresult = 0, + cmpexact = 0, + matchelem, + new_nelems = 0; + + Assert(array->num_elems > 0); + Assert(!(arraysk->sk_flags & SK_BT_SKIP)); + matchelem = _bt_binsrch_array_skey(orderprocp, false, NoMovementScanDirection, skey->sk_argument, false, array, @@ -1099,6 +1508,137 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey array->num_elems = new_nelems; *qual_ok = new_nelems > 0; +} + +/* + * Finish off preprocessing of skip array scan key when it is "redundant with" + * a non-array scalar scan key. The scalar scan key must be an inequality. + * _bt_compare_array_scankey_args helper function, called after the relevant + * (potentially cross-type) ORDER proc has been looked up successfully. + * + * Unlike _bt_array_preproc_shrink, we cannot really modify caller's array + * in-place. Skip arrays work by procedurally generating their elements as + * needed, so our approach is to store a copy of the inequality in the skip + * array, allowing its elements to be generated within the limits of a range. + * Calling here always renders caller's scalar scan key redundant (the key is + * applied when the array advances, but that's just an implementation detail). + * + * Return value indicates if the array already had a lower/upper bound + * (whichever caller's scalar scan key was expected to be). We return true in + * the common case where caller's scan key could be successfully rolled into + * the skip array. We return false when we can't do that due to the presence + * of a conflicting inequality. + */ +static bool +_bt_skip_preproc_shrink(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderprocp, BTArrayKeyInfo *array, + bool *qual_ok) +{ + bool test_result; + + /* + * We don't expect to have to deal with NULLs in non-array/non-skip scan + * key. We expect _bt_preprocess_array_keys to avoid generating a skip + * array for an index attribute with an IS NULL input scan key. (It will + * still do so in the presence of IS NOT NULL input scan keys, but + * _bt_compare_scankey_args is expected to handle those for us.) + */ + Assert(arraysk->sk_flags & SK_BT_SKIP); + Assert(arraysk->sk_flags & SK_SEARCHARRAY); + Assert(arraysk->sk_strategy == BTEqualStrategyNumber); + Assert(array->num_elems == -1); + + /* Scalar scan key must be a B-Tree inequality, which are always strict */ + Assert(!(skey->sk_flags & SK_ISNULL)); + Assert(skey->sk_strategy != BTEqualStrategyNumber); + + /* + * Array must not generate a NULL array element (for "IS NULL" qual). Its + * index attribute is constrained by a strict operator, so NULL elements + * must not be returned by the scan (it would be wrong to allow it). + */ + array->null_elem = false; + *qual_ok = true; + + /* + * Store a copy of caller's scalar scan key, plus a copy of the operator's + * corresponding 3-way ORDER proc. + * + * A skip array scan key always uses the underlying index attribute's + * input opclass, but it's possible that caller's scalar scan key uses a + * cross-type operator. In cross-type scenarios, skey.sk_argument doesn't + * use the same type as later array elements (which are all just copies of + * datums taken from index tuples, possibly modified by skip support). + * + * We represent the lowest (and highest) possible value in the array using + * the sentinel value -inf (+inf for high_compare). The only exceptions + * apply when the opclass has skip support: there we can use a copy of the + * skip support routine's low_elem/high_elem instead -- though only when + * there is no corresponding low_compare/high_compare inequality. + * + * _bt_first understands that -inf/+inf indicate that it should use the + * low_compare/high_compare inequality for initial positioning purposes + * when it sees either value (unless there is no corresponding inequality, + * in which case the values are literally interpreted as -inf or +inf). + * _bt_first can therefore vary in whether it uses a cross-type operator, + * or an input-opclass-only operator (it can vary across primitive scans + * for the same index attribute/skip array). + * + * _bt_scankey_decrement/_bt_scankey_increment both make sure that each + * newly generated element is constrained by low_compare/high_compare. + * This must happen without skey.sk_argument ever being treated as a true + * array element (that wouldn't always work because array elements are + * only ever supposed to use the opclass input type). + */ + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (array->high_compare) + { + /* try to keep only one high_compare inequality */ + if (!_bt_compare_scankey_args(scan, array->high_compare, skey, + array->high_compare, NULL, NULL, + &test_result)) + return false; /* can't make new high_compare redundant */ + + if (!test_result) + return true; /* discard new high_compare */ + + /* replace old high_compare with new one */ + } + else + array->high_compare = palloc(sizeof(ScanKeyData)); + + memcpy(array->high_compare, skey, sizeof(ScanKeyData)); + array->order_high = *orderprocp; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (array->low_compare) + { + /* try to keep only one low_compare inequality */ + if (!_bt_compare_scankey_args(scan, array->low_compare, skey, + array->low_compare, NULL, NULL, + &test_result)) + return false; /* can't make new low_compare redundant */ + + if (!test_result) + return true; /* discard new low_compare */ + + /* replace old low_compare with new one */ + } + else + array->low_compare = palloc(sizeof(ScanKeyData)); + + memcpy(array->low_compare, skey, sizeof(ScanKeyData)); + array->order_low = *orderprocp; + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } return true; } @@ -1141,7 +1681,8 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, - Datum arrdatum, ScanKey cur) + Datum arrdatum, bool arrnull, + ScanKey cur) { int32 result = 0; @@ -1149,14 +1690,14 @@ _bt_compare_array_skey(FmgrInfo *orderproc, if (tupnull) /* NULL tupdatum */ { - if (cur->sk_flags & SK_ISNULL) + if (arrnull) result = 0; /* NULL "=" NULL */ else if (cur->sk_flags & SK_BT_NULLS_FIRST) result = -1; /* NULL "<" NOT_NULL */ else result = 1; /* NULL ">" NOT_NULL */ } - else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ + else if (arrnull) /* NOT_NULL tupdatum, NULL arrdatum */ { if (cur->sk_flags & SK_BT_NULLS_FIRST) result = 1; /* NOT_NULL ">" NULL */ @@ -1222,6 +1763,8 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum arrdatum; Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(!(cur->sk_flags & SK_BT_SKIP)); + Assert(!(cur->sk_flags & SK_ISNULL)); /* plain arrays can't do this */ Assert(cur->sk_strategy == BTEqualStrategyNumber); if (cur_elem_trig) @@ -1257,7 +1800,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, { arrdatum = array->elem_values[low_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result <= 0) { @@ -1285,7 +1828,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, { arrdatum = array->elem_values[high_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result >= 0) { @@ -1312,7 +1855,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, arrdatum = array->elem_values[mid_elem]; result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - arrdatum, cur); + arrdatum, false, cur); if (result == 0) { @@ -1337,13 +1880,102 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, */ if (low_elem != mid_elem) result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, - array->elem_values[low_elem], cur); + array->elem_values[low_elem], false, + cur); *set_elem_result = result; return low_elem; } +/* + * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array + * + * This routine doesn't return an index into the array, because the array + * doesn't actually have any elements (it generates its array elements + * procedurally instead). Note that this may include a NULL value/an IS NULL + * qual. + * + * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true + * array. The value 0 indicates that tupdatum/tupnull is within the range of + * the skip array. Other values indicate what _bt_compare_array_skey returned + * for the best available match to tupdatum/tupnull (in practice this means + * either the lowest item or the highest item in the range of the array). + * + * cur_elem_trig indicates if array advancement was triggered by this array's + * scan key. We use this to optimize-away comparisons that are known by our + * caller to be unnecessary from context, just like _bt_binsrch_array_skey. + */ +static void +_bt_binsrch_skiparray_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_flags & SK_BT_REQFWD); + Assert(array->num_elems == -1); + Assert(!ScanDirectionIsNoMovement(dir)); + + if (tupnull) /* NULL tupdatum */ + { + if (array->null_elem) + *set_elem_result = 0; /* NULL "=" NULL */ + else if (cur->sk_flags & SK_BT_NULLS_FIRST) + *set_elem_result = -1; /* NULL "<" NOT_NULL */ + else + *set_elem_result = 1; /* NULL ">" NOT_NULL */ + + return; + } + + /* + * Array inequalities determine whether tupdatum is within the range of + * caller's skip array + */ + *set_elem_result = 0; + if (ScanDirectionIsForward(dir)) + { + /* + * Evaluate low_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate high_compare + */ + if (!cur_elem_trig && array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))) + *set_elem_result = -1; + else if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + } + else + { + /* + * Evaluate high_compare first (unless cur_elem_trig tells us that it + * cannot possibly fail to be satisfied), then evaluate low_compare + */ + if (!cur_elem_trig && array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + tupdatum, + array->high_compare->sk_argument))) + *set_elem_result = 1; + else if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + tupdatum, + array->low_compare->sk_argument))) + *set_elem_result = -1; + } +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * @@ -1353,29 +1985,506 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc, void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; - int i; Assert(so->numArrayKeys); Assert(so->qual_ok); - for (i = 0; i < so->numArrayKeys; i++) + for (int i = 0; i < so->numArrayKeys; i++) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->keyData[curArrayKey->scan_key]; - Assert(curArrayKey->num_elems > 0); Assert(skey->sk_flags & SK_SEARCHARRAY); - if (ScanDirectionIsBackward(dir)) - curArrayKey->cur_elem = curArrayKey->num_elems - 1; - else - curArrayKey->cur_elem = 0; - skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; + _bt_scankey_set_low_or_high(rel, skey, curArrayKey, + ScanDirectionIsForward(dir)); } so->scanBehind = so->oppoDirCheck = false; /* reset */ } +/* + * _bt_scankey_set_low_or_high() -- Set array scan key to lowest/highest element + * + * Caller also passes associated scan key, which will have its argument set to + * the lowest/highest array value in passing. + */ +static void +_bt_scankey_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + bool low_not_high) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + + if (array->num_elems != -1) + { + /* set low or high element for conventional array */ + int set_elem = 0; + + Assert(!(skey->sk_flags & SK_BT_SKIP)); + + if (!low_not_high) + set_elem = array->num_elems - 1; + + /* + * Just copy over array datum (only skip arrays require freeing and + * allocating memory for sk_argument) + */ + array->cur_elem = set_elem; + skey->sk_argument = array->elem_values[set_elem]; + + return; + } + + /* set low or high element for skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Clear possibly-irrelevant flags */ + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_NEGPOSINF | SK_BT_NEXTPRIOR); + + if (array->null_elem && + (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) + { + /* Lowest (or highest) element is NULL, so set scan key to NULL */ + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); + } + else if (low_not_high) + { + /* Lowest array element isn't NULL */ + if (array->use_sksup && !array->low_compare) + skey->sk_argument = datumCopy(array->sksup.low_elem, + attr->attbyval, attr->attlen); + else + skey->sk_flags |= SK_BT_NEGPOSINF; + } + else + { + /* Highest array element isn't NULL */ + if (array->use_sksup && !array->high_compare) + skey->sk_argument = datumCopy(array->sksup.high_elem, + attr->attbyval, attr->attlen); + else + skey->sk_flags |= SK_BT_NEGPOSINF; + } +} + +/* + * _bt_scankey_set_element() -- Set skip array scan key's sk_argument + * + * Sets scan key to "IS NULL" when required, and handles memory management for + * pass-by-reference types. + */ +static void +_bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, + Datum tupdatum, bool tupnull) +{ + /* tupdatum within the range of low_value/high_value */ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(tupnull && !array->null_elem)); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = (Datum) 0; + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | + SK_BT_NEGPOSINF | SK_BT_NEXTPRIOR); + + /* + * Treat tupdatum/tupnull as a matching array element. + * + * We just copy tupdatum into the array's scan key (there is no + * conventional array element for us to set, of course). + * + * Unlike standard arrays, skip arrays sometimes need to locate NULLs. + * Treat them as just another value from the domain of indexed values. + */ + if (!tupnull) + skey->sk_argument = datumCopy(tupdatum, attr->attbyval, attr->attlen); + else + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + +/* + * _bt_scankey_unset_isnull() -- increment/decrement scan key from NULL + * + * Unsets scan key's "IS NULL" marking, and sets the non-NULL value from the + * array immediately before (or immediate after) NULL in the key space. + */ +static void +_bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(skey->sk_flags & SK_SEARCHNULL); + Assert(skey->sk_flags & SK_ISNULL); + Assert(!(skey->sk_flags & (SK_BT_NEGPOSINF | SK_BT_NEXTPRIOR))); + Assert(skey->sk_argument == 0); + Assert(array->use_sksup && array->null_elem && + !array->low_compare && !array->high_compare); + + /* + * sk_argument must be set to whatever non-NULL value comes immediately + * before or after NULL + */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); + if (skey->sk_flags & SK_BT_NULLS_FIRST) + skey->sk_argument = datumCopy(array->sksup.low_elem, + attr->attbyval, attr->attlen); + else + skey->sk_argument = datumCopy(array->sksup.high_elem, + attr->attbyval, attr->attlen); +} + +/* + * _bt_scankey_set_isnull() -- decrement/increment scan key to NULL + */ +static void +_bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_BT_SKIP); + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & (SK_SEARCHNULL | SK_ISNULL | + SK_BT_NEGPOSINF | SK_BT_NEXTPRIOR))); + Assert(array->null_elem); + Assert(!array->low_compare && !array->high_compare); + + /* Free memory previously allocated for sk_argument if needed */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + + /* Set sk_argument to NULL */ + skey->sk_argument = (Datum) 0; + skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); +} + +/* + * _bt_scankey_decrement() -- decrement array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully decremented. + * Cannot decrement an array whose current element is already the first one. + */ +static bool +_bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool underflow = false; + Datum dec_sk_argument; + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & SK_BT_NEXTPRIOR)); + + /* Regular (non-skip) array? */ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & SK_BT_SKIP)); + if (array->cur_elem > 0) + { + /* + * Just copy over array datum (only skip arrays require freeing + * and allocating memory for sk_argument) + */ + array->cur_elem--; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully decremented array */ + return true; + } + + /* Cannot decrement to before first array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* The sentinel value -inf is never decrementable */ + if (skey->sk_flags & SK_BT_NEGPOSINF) + return false; + + /* + * When the current array element is NULL, and the lowest sorting value in + * the index is also NULL, we cannot decrement before first array element + */ + if ((skey->sk_flags & SK_ISNULL) && (skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "decrement" the scan key's current + * element by setting the NEXTPRIOR flag. The true prior value can only + * be determined when the scan reads lower sorting tuples. + * + * When the current array element is NULL, and the highest sorting value + * in the index is also NULL, _bt_first can find the highest non-NULL. + */ + if (!array->use_sksup) + { + /* + * Determine as best we can (given the lack of skip support) whether + * the prior element will turn out to be out of bounds for the skip + * array. + * + * Skip arrays (that lack skip support) can only do this when their + * low_compare is for an >= inequality; if the current array element + * is == the inequality's sk_argument, then the true prior value + * cannot possibly satisfy low_compare. We can give up right away. + */ + if (array->low_compare && + array->low_compare->sk_strategy == BTGreaterEqualStrategyNumber && + _bt_compare_array_skey(&array->order_low, + array->low_compare->sk_argument, false, + skey->sk_argument, false, + skey) == 0) + return false; + + /* else the scan must figure out the true prior value */ + skey->sk_flags |= SK_BT_NEXTPRIOR; + return true; + } + + /* + * Opclasses with skip support decrement the scan key's current element + * using a callback + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(!(skey->sk_flags & SK_BT_NULLS_FIRST)); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * Decrement current array element to the high_elem value provided by + * opclass skip support routine. + */ + _bt_scankey_unset_isnull(rel, skey, array); + return true; + } + + /* + * Ask opclass support routine to provide decremented copy of existing + * non-NULL sk_argument + */ + dec_sk_argument = array->sksup.decrement(rel, skey->sk_argument, &underflow); + + if (underflow) + { + if (array->null_elem && (skey->sk_flags & SK_BT_NULLS_FIRST)) + { + /* + * Existing sk_argument was already equal to non-NULL low_elem + * provided by opclass skip support routine, but skip array's true + * lowest element is actually NULL. + * + * Decrement sk_argument to NULL. + */ + _bt_scankey_set_isnull(rel, skey, array); + return true; + } + + /* Cannot decrement before first array element */ + return false; + } + + /* + * Successfully decremented sk_argument to a non-NULL value. Make sure + * that the decremented value is still within the range of the skip array. + */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (array->low_compare && + !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, + array->low_compare->sk_collation, + dec_sk_argument, + array->low_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!attr->attbyval) + pfree(DatumGetPointer(dec_sk_argument)); + + /* Cannot decrement before first array element */ + return false; + } + + /* Accept non-NULL datum value from opclass decrement callback */ + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = dec_sk_argument; + + return true; +} + +/* + * _bt_scankey_increment() -- increment array scan key's sk_argument + * + * Return value indicates whether caller's array was successfully incremented. + * Cannot increment an array whose current element is already the final one. + */ +static bool +_bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) +{ + bool overflow = false; + Datum inc_sk_argument; + Form_pg_attribute attr; + + Assert(skey->sk_flags & SK_SEARCHARRAY); + Assert(!(skey->sk_flags & SK_BT_NEXTPRIOR)); + + /* Regular (non-skip) array? */ + if (array->num_elems != -1) + { + Assert(!(skey->sk_flags & SK_BT_SKIP)); + if (array->cur_elem < array->num_elems - 1) + { + /* + * Just copy over array datum (only skip arrays require freeing + * and allocating memory for sk_argument) + */ + array->cur_elem++; + skey->sk_argument = array->elem_values[array->cur_elem]; + + /* Successfully incremented array */ + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* Nope, this is a skip array */ + Assert(skey->sk_flags & SK_BT_SKIP); + + /* The sentinel value +inf is never incrementable */ + if (skey->sk_flags & SK_BT_NEGPOSINF) + return false; + + /* + * When the current array element is NULL, and the highest sorting value + * in the index is also NULL, we cannot increment past the final element + */ + if ((skey->sk_flags & SK_ISNULL) && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + return false; + + /* + * Opclasses without skip support "increment" the scan key's current + * element by setting the NEXTPRIOR flag. The true next value can only be + * determined when the scan reads higher sorting tuples. + * + * When the current array element is NULL, and the lowest sorting value in + * the index is also NULL, _bt_first can find the lowest non-NULL. + */ + if (!array->use_sksup) + { + /* + * Determine as best we can (given the lack of skip support) whether + * the next element will turn out to be out of bounds for the skip + * array. + * + * Skip arrays (that lack skip support) can only do this when their + * high_compare is for an <= inequality; if the current array element + * is == the inequality's sk_argument, then the true next value cannot + * possibly satisfy high_compare. We can give up right away. + */ + if (array->high_compare && + array->high_compare->sk_strategy == BTLessEqualStrategyNumber && + _bt_compare_array_skey(&array->order_high, + array->high_compare->sk_argument, false, + skey->sk_argument, false, + skey) == 0) + return false; + + /* else the scan must figure out the true next value */ + skey->sk_flags |= SK_BT_NEXTPRIOR; + return true; + } + + /* + * Opclasses with skip support increment the scan key's current element + * using a callback + */ + if (skey->sk_flags & SK_ISNULL) + { + Assert(skey->sk_flags & SK_BT_NULLS_FIRST); + + /* + * Existing sk_argument/array element is NULL (for an IS NULL qual). + * + * Increment current array element to the low_elem value provided by + * opclass skip support routine. + */ + _bt_scankey_unset_isnull(rel, skey, array); + return true; + } + + /* + * Ask opclass support routine to provide incremented copy of existing + * non-NULL sk_argument + */ + inc_sk_argument = array->sksup.increment(rel, skey->sk_argument, &overflow); + + if (overflow) + { + if (array->null_elem && !(skey->sk_flags & SK_BT_NULLS_FIRST)) + { + /* + * Existing sk_argument was already equal to non-NULL high_elem + * provided by opclass skip support routine, but skip array's true + * highest element is actually NULL. + * + * Increment sk_argument to NULL. + */ + _bt_scankey_set_isnull(rel, skey, array); + return true; + } + + /* Cannot increment past final array element */ + return false; + } + + /* + * Successfully incremented sk_argument to a non-NULL value. Make sure + * that the incremented value is still within the range of the skip array. + */ + attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1); + if (array->high_compare && + !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, + array->high_compare->sk_collation, + inc_sk_argument, + array->high_compare->sk_argument))) + { + /* Keep existing sk_argument after all */ + if (!attr->attbyval) + pfree(DatumGetPointer(inc_sk_argument)); + + /* Cannot increment past final array element */ + return false; + } + + /* Accept non-NULL datum value from opclass increment callback */ + if (!attr->attbyval && skey->sk_argument) + pfree(DatumGetPointer(skey->sk_argument)); + skey->sk_argument = inc_sk_argument; + + return true; +} + /* * _bt_advance_array_keys_increment() -- Advance to next set of array elements * @@ -1391,6 +2500,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; /* @@ -1400,29 +2510,30 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) */ for (int i = so->numArrayKeys - 1; i >= 0; i--) { - BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->keyData[curArrayKey->scan_key]; - int cur_elem = curArrayKey->cur_elem; - int num_elems = curArrayKey->num_elems; - bool rolled = false; + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; - if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems) + if (ScanDirectionIsForward(dir)) { - cur_elem = 0; - rolled = true; + if (_bt_scankey_increment(rel, skey, array)) + return true; } - else if (ScanDirectionIsBackward(dir) && --cur_elem < 0) + else { - cur_elem = num_elems - 1; - rolled = true; + if (_bt_scankey_decrement(rel, skey, array)) + return true; } - curArrayKey->cur_elem = cur_elem; - skey->sk_argument = curArrayKey->elem_values[cur_elem]; - if (!rolled) - return true; + /* + * Handle array roll over. + * + * Start over at the array's lowest sorting value (or its highest + * value, for backward scans)... + */ + _bt_scankey_set_low_or_high(rel, skey, array, + ScanDirectionIsForward(dir)); - /* Need to advance next array key, if any */ + /* ...then advance next most significant array, if any */ } /* @@ -1477,6 +2588,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; int arrayidx = 0; @@ -1484,7 +2596,6 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) { ScanKey cur = so->keyData + ikey; BTArrayKeyInfo *array = NULL; - int first_elem_dir; if (!(cur->sk_flags & SK_SEARCHARRAY) || cur->sk_strategy != BTEqualStrategyNumber) @@ -1496,16 +2607,10 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) continue; - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; + Assert(array->num_elems != -1); /* No skipping of non-required arrays */ - if (array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); } } @@ -1569,6 +2674,8 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) { ScanKey cur = so->keyData + ikey; + Datum sk_argument = cur->sk_argument; + bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0; Datum tupdatum; bool tupnull; int32 result; @@ -1630,9 +2737,67 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); - result = _bt_compare_array_skey(&so->orderProcs[ikey], - tupdatum, tupnull, - cur->sk_argument, cur); + if (!(cur->sk_flags & SK_BT_NEGPOSINF)) + { + /* Just use the array's current array element */ + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + sk_argument, sk_isnull, cur); + + /* + * When scan key is marked NEXTPRIOR, the current array element is + * "sk_argument + infinitesimal" (or the current array element is + * "sk_argument - infinitesimal", during backwards scans) + */ + if (result == 0 && (cur->sk_flags & SK_BT_NEXTPRIOR)) + { + /* + * tupdatum is actually still < "sk_argument + infinitesimal" + * (or it's actually still > "sk_argument - infinitesimal") + */ + return true; + } + } + else + { + /* + * The scankey lacks a conventional sk_argument/element value, + * since it's marked as containing the sentinel value -inf/+inf. + * + * Note: -inf could mean "absolute" -inf, or it could represent + * the lowest possible value that still satisfies the array's + * low_compare. +inf and high_compare work similarly. + */ + BTArrayKeyInfo *array = NULL; + + for (int arrayidx = 0; arrayidx < so->numArrayKeys; arrayidx++) + { + array = &so->arrayKeys[arrayidx]; + if (array->scan_key == ikey) + break; + } + + /* + * Compare tupdatum against -inf using array's low_compare, if any + * (or compare it against +inf using array's high_compare). + * + * Optimization: avoid uselessly evaluating array's high_compare + * (or uselessly evaluating array's low_compare) by passing + * cur_elem_trig=true, along with an inverted scan direction. + */ + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], true, -dir, + tupdatum, tupnull, array, cur, + &result); + + if (result == 0) + { + /* + * tupdatum is > -inf sk_argument (or < +inf sk_argument). + * It's time for caller to advance the scan's array keys. + */ + return false; + } + } /* * Does this comparison indicate that caller must _not_ advance the @@ -1964,18 +3129,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (beyond_end_advance) { - int final_elem_dir; - - if (ScanDirectionIsBackward(dir) || !array) - final_elem_dir = 0; - else - final_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != final_elem_dir) - { - array->cur_elem = final_elem_dir; - cur->sk_argument = array->elem_values[final_elem_dir]; - } + if (array) + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); continue; } @@ -2000,18 +3156,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ if (!all_required_satisfied || cur->sk_attno > tupnatts) { - int first_elem_dir; - - if (ScanDirectionIsForward(dir) || !array) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array && array->cur_elem != first_elem_dir) - { - array->cur_elem = first_elem_dir; - cur->sk_argument = array->elem_values[first_elem_dir]; - } + if (array) + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); continue; } @@ -2029,15 +3176,27 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, /* * Binary search for closest match that's available from the array */ - set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], - cur_elem_trig, dir, - tupdatum, tupnull, array, cur, - &result); + if (array->num_elems != -1) + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); - Assert(set_elem >= 0 && set_elem < array->num_elems); + /* + * Skip array. "Binary search" by checking if tupdatum/tupnull + * are within the low_value/high_value range of the skip array. + */ + else + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); } else { + Datum sk_argument = cur->sk_argument; + bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0; + Assert(sktrig_required && required); /* @@ -2051,7 +3210,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ result = _bt_compare_array_skey(&so->orderProcs[ikey], tupdatum, tupnull, - cur->sk_argument, cur); + sk_argument, sk_isnull, cur); } /* @@ -2110,11 +3269,76 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } } - /* Advance array keys, even when set_elem isn't an exact match */ - if (array && array->cur_elem != set_elem) + /* Advance array keys, even when we don't have an exact match */ + + if (!array) + continue; /* no element to set in non-array */ + + /* Conventional arrays have a valid set_elem for us to advance to */ + if (array->num_elems != -1) { - array->cur_elem = set_elem; - cur->sk_argument = array->elem_values[set_elem]; + if (array->cur_elem != set_elem) + { + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; + } + + continue; + } + + /* + * Skip arrays generate array elements procedurally and on demand. + * They "contain" elements for every possible datum from a given range + * of values. This is often the range -inf through to +inf. + */ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(array->num_elems == -1); + Assert(required); + + /* + * When a binary search of a conventional array locates a set_elem + * that is merely the best available match for tupdatum (not an exact + * match), set_elem isn't necessarily set to the absolute lowest or + * highest array element (though we must set subsequent lower-order + * !all_required_satisfied arrays that way, as the process cascades). + * + * However, when a "binary search" of a skip array finds that tupdatum + * isn't within the range of the skip array, we always advance the + * array to either the highest or the lowest possible element value + * (it's often set to either the +inf or the -inf element/value). + * There can be no "gaps between array elements", so either we find an + * exact match or we follow the same steps followed for later arrays + * that array advancement will cascade to. + */ + if (beyond_end_advance) + { + /* + * We need to set the array element to the final element in the + * current scan direction for "beyond end of array element" array + * advancement + */ + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsBackward(dir)); + } + else if (!all_required_satisfied) + { + /* + * The closest matching element is the lowest element; even that + * still puts us ahead of caller's tuple in the key space + */ + Assert(sktrig < ikey); /* Caller must get this right */ + _bt_scankey_set_low_or_high(rel, cur, array, + ScanDirectionIsForward(dir)); + } + else + { + /* + * Search found tupdatum within the range of the skip array. + * + * Set scan key's sk_argument to tupdatum. If tupdatum is null, + * we'll set IS NULL flags in scan key's sk_flags instead. + */ + _bt_scankey_set_element(rel, cur, array, tupdatum, tupnull); } } @@ -2465,6 +3689,8 @@ end_toplevel_scan: * within each attribute may be done as a byproduct of the processing here. * That process must leave array scan keys (within an attribute) in the same * order as corresponding entries from the scan's BTArrayKeyInfo array info. + * We might also cons up skip array scan keys that weren't present in the + * original input keys; these are also output in standard attribute order. * * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD * if they must be satisfied in order to continue the scan forward or backward @@ -2588,8 +3814,8 @@ _bt_preprocess_keys(IndexScanDesc scan) inputsk = scan->keyData; /* - * Now that we have an estimate of the number of output scan keys, - * allocate space for them + * Now that we have an estimate of the number of output scan keys + * (including any skip array scan keys), allocate space for them */ so->keyData = palloc(sizeof(ScanKeyData) * numberOfKeys); @@ -2725,7 +3951,8 @@ _bt_preprocess_keys(IndexScanDesc scan) return; } /* else discard the redundant non-equality key */ - Assert(!array || array->num_elems > 0); + Assert(!array || array->num_elems > 0 || + array->num_elems == -1); xform[j].skey = NULL; xform[j].ikey = -1; } @@ -2888,7 +4115,8 @@ _bt_preprocess_keys(IndexScanDesc scan) /* Have all we need to determine redundancy */ if (test_result) { - Assert(!array || array->num_elems > 0); + Assert(!array || array->num_elems > 0 || + array->num_elems == -1); /* * New key is more restrictive, and so replaces old key... @@ -3030,10 +4258,11 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->scan_key != ikey) return false; - if (array->num_elems <= 0) + if (array->num_elems == 0 || array->num_elems < -1) return false; - if (cur->sk_argument != array->elem_values[array->cur_elem]) + if (array->num_elems != -1 && + cur->sk_argument != array->elem_values[array->cur_elem]) return false; if (last_sk_attno > cur->sk_attno) return false; @@ -3108,6 +4337,22 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, bool leftnull, rightnull; + /* Handle skip array comparison with IS NOT NULL scan key */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP) + { + /* Shouldn't generate skip array in presence of IS NULL key */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNULL)); + Assert((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNOTNULL); + + /* Skip array will have no NULL element/IS NULL scan key */ + Assert(array->num_elems == -1); + array->null_elem = false; + + /* IS NOT NULL key (could be leftarg or rightarg) now redundant */ + *result = true; + return true; + } + if (leftarg->sk_flags & SK_ISNULL) { Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL)); @@ -3181,6 +4426,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, { /* Can't make the comparison */ *result = false; /* suppress compiler warnings */ + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); return false; } @@ -3744,6 +4990,20 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, continue; } + /* + * A skip array scan key might be negative/positive infinity. Might + * also be next key/prior key sentinel, which we don't deal with. + */ + if (key->sk_flags & (SK_BT_NEGPOSINF | SK_BT_NEXTPRIOR)) + { + Assert(key->sk_flags & SK_SEARCHARRAY); + Assert(key->sk_flags & SK_BT_SKIP); + Assert(requiredSameDir); + + *continuescan = false; + return false; + } + /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c index e9d4cd60d..96d0d9185 100644 --- a/src/backend/access/nbtree/nbtvalidate.c +++ b/src/backend/access/nbtree/nbtvalidate.c @@ -114,6 +114,10 @@ btvalidate(Oid opclassoid) case BTOPTIONS_PROC: ok = check_amoptsproc_signature(procform->amproc); break; + case BTSKIPSUPPORT_PROC: + ok = check_amproc_signature(procform->amproc, VOIDOID, true, + 1, 1, INTERNALOID); + break; default: ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index b8b5c147c..a86dbf71b 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -1330,6 +1330,31 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("btree equal image functions must not be cross-type"))); } + else if (member->number == BTSKIPSUPPORT_PROC) + { + if (procform->pronargs != 1 || + procform->proargtypes.values[0] != INTERNALOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must accept type \"internal\""))); + if (procform->prorettype != VOIDOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must return void"))); + + /* + * pg_amproc functions are indexed by (lefttype, righttype), but a + * skip support function doesn't make sense in cross-type + * scenarios. The same opclass opcintype OID is always used for + * lefttype and righttype. Providing a cross-type routine isn't + * sensible. Reject cross-type ALTER OPERATOR FAMILY ... ADD + * FUNCTION 6 statements here. + */ + if (member->lefttype != member->righttype) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("btree skip support functions must not be cross-type"))); + } } else if (amoid == HASH_AM_OID) { diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index edb09d4e3..e945686c8 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -96,6 +96,7 @@ OBJS = \ rowtypes.o \ ruleutils.o \ selfuncs.o \ + skipsupport.o \ tid.o \ timestamp.o \ trigfuncs.o \ diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 9c854e0e5..79658f068 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -34,6 +34,7 @@ #include "utils/date.h" #include "utils/datetime.h" #include "utils/numeric.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" /* @@ -455,6 +456,49 @@ date_sortsupport(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +static Datum +date_decrement(Relation rel, Datum existing, bool *underflow) +{ + DateADT dexisting = DatumGetDateADT(existing); + + if (dexisting == DATEVAL_NOBEGIN) + { + *underflow = true; + return 0; + } + + *underflow = false; + return DateADTGetDatum(dexisting - 1); +} + +static Datum +date_increment(Relation rel, Datum existing, bool *overflow) +{ + DateADT dexisting = DatumGetDateADT(existing); + + if (dexisting == DATEVAL_NOEND) + { + *overflow = true; + return 0; + } + + *overflow = false; + return DateADTGetDatum(dexisting + 1); +} + +Datum +date_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + + sksup->decrement = date_decrement; + sksup->increment = date_increment; + sksup->low_elem = DateADTGetDatum(DATEVAL_NOBEGIN); + sksup->high_elem = DateADTGetDatum(DATEVAL_NOEND); + + PG_RETURN_VOID(); +} + Datum date_finite(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index 8c6fc80c3..91682edd5 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -83,6 +83,7 @@ backend_sources += files( 'rowtypes.c', 'ruleutils.c', 'selfuncs.c', + 'skipsupport.c', 'tid.c', 'timestamp.c', 'trigfuncs.c', diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index bf42393be..0e6e9ebb9 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6806,6 +6806,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, List *indexBoundQuals; int indexcol; bool eqQualHere; + bool found_skip; bool found_saop; bool found_is_null_op; double num_sa_scans; @@ -6831,6 +6832,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, indexBoundQuals = NIL; indexcol = 0; eqQualHere = false; + found_skip = false; found_saop = false; found_is_null_op = false; num_sa_scans = 1; @@ -6839,15 +6841,38 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, IndexClause *iclause = lfirst_node(IndexClause, lc); ListCell *lc2; + /* + * XXX For now we just cost skip scans via generic rules: make a + * uniform assumption that there will be 10 primitive index scans per + * skipped attribute, relying on the "1/3 of all index pages" cap that + * this costing has used since Postgres 17. Also assume that skipping + * won't take place for an index that has fewer than 100 pages. + * + * The current approach to costing leaves much to be desired, but is + * at least better than nothing at all (keeping the code as it is on + * HEAD just makes testing and review inconvenient). + */ if (indexcol != iclause->indexcol) { /* Beginning of a new column's quals */ if (!eqQualHere) - break; /* done if no '=' qual for indexcol */ + { + found_skip = true; /* skip when no '=' qual for indexcol */ + if (index->pages < 100) + break; + num_sa_scans += 10; + } eqQualHere = false; indexcol++; if (indexcol != iclause->indexcol) - break; /* no quals at all for indexcol */ + { + /* no quals at all for indexcol */ + found_skip = true; + if (index->pages < 100) + break; + num_sa_scans += 10 * (iclause->indexcol - indexcol); + continue; + } } /* Examine each indexqual associated with this index clause */ @@ -6920,6 +6945,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, if (index->unique && indexcol == index->nkeycolumns - 1 && eqQualHere && + !found_skip && !found_saop && !found_is_null_op) numIndexTuples = 1.0; diff --git a/src/backend/utils/adt/skipsupport.c b/src/backend/utils/adt/skipsupport.c new file mode 100644 index 000000000..d91471e26 --- /dev/null +++ b/src/backend/utils/adt/skipsupport.c @@ -0,0 +1,60 @@ +/*------------------------------------------------------------------------- + * + * skipsupport.c + * Support routines for B-Tree skip scan. + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/adt/skipsupport.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "utils/lsyscache.h" +#include "utils/skipsupport.h" + +/* + * Fill in SkipSupport given an operator class (opfamily + opcintype). + * + * On success, returns true, and initializes all SkipSupport fields for + * caller. Otherwise returns false, indicating that operator class has no + * skip support function. + */ +bool +PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, bool reverse, + SkipSupport sksup) +{ + Oid skipSupportFunction; + + /* Look for a skip support function */ + skipSupportFunction = get_opfamily_proc(opfamily, opcintype, opcintype, + BTSKIPSUPPORT_PROC); + if (!OidIsValid(skipSupportFunction)) + return false; + + OidFunctionCall1(skipSupportFunction, PointerGetDatum(sksup)); + + if (reverse) + { + /* + * DESC/reverse case: swap low_elem with high_elem, and swap decrement + * with increment + */ + Datum low_elem = sksup->low_elem; + SkipSupportIncDec decrement = sksup->decrement; + + sksup->low_elem = sksup->high_elem; + sksup->decrement = sksup->increment; + + sksup->high_elem = low_elem; + sksup->increment = decrement; + } + + return true; +} diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index 45eb1b2fe..e2d98a62f 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -13,12 +13,15 @@ #include "postgres.h" +#include + #include "common/hashfn.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "port/pg_bswap.h" #include "utils/fmgrprotos.h" #include "utils/guc.h" +#include "utils/skipsupport.h" #include "utils/sortsupport.h" #include "utils/timestamp.h" #include "utils/uuid.h" @@ -390,6 +393,70 @@ uuid_abbrev_convert(Datum original, SortSupport ssup) return res; } +static Datum +uuid_decrement(Relation rel, Datum existing, bool *underflow) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + *underflow = false; + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] > 0) + { + uuid->data[i]--; + return UUIDPGetDatum(uuid); + } + uuid->data[i] = UCHAR_MAX; + } + + *underflow = true; + + return 0; +} + +static Datum +uuid_increment(Relation rel, Datum existing, bool *overflow) +{ + pg_uuid_t *uuid; + + uuid = (pg_uuid_t *) palloc(UUID_LEN); + memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN); + *overflow = false; + for (int i = UUID_LEN - 1; i >= 0; i--) + { + if (uuid->data[i] < UCHAR_MAX) + { + uuid->data[i]++; + return UUIDPGetDatum(uuid); + } + uuid->data[i] = 0; + } + + *overflow = true; + + return 0; +} + +Datum +uuid_skipsupport(PG_FUNCTION_ARGS) +{ + SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0); + pg_uuid_t *uuid_min = palloc(UUID_LEN); + pg_uuid_t *uuid_max = palloc(UUID_LEN); + + memset(uuid_min->data, 0x00, UUID_LEN); + memset(uuid_max->data, 0xFF, UUID_LEN); + + sksup->decrement = uuid_decrement; + sksup->increment = uuid_increment; + sksup->low_elem = UUIDPGetDatum(uuid_min); + sksup->high_elem = UUIDPGetDatum(uuid_max); + + PG_RETURN_VOID(); +} + /* hash index support */ Datum uuid_hash(PG_FUNCTION_ARGS) diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index c0a52cdcc..131b23a8e 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -28,6 +28,7 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/nbtree.h" #include "access/slru.h" #include "access/toast_compression.h" #include "access/twophase.h" @@ -1753,6 +1754,17 @@ struct config_bool ConfigureNamesBool[] = }, #endif + /* XXX Remove before commit */ + { + {"skipscan_skipsupport_enabled", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_skipsupport_enabled, + true, + NULL, NULL, NULL + }, + { {"integer_datetimes", PGC_INTERNAL, PRESET_OPTIONS, gettext_noop("Shows whether datetimes are integer based."), @@ -3587,6 +3599,17 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + /* XXX Remove before commit */ + { + {"skipscan_prefix_cols", PGC_SUSET, DEVELOPER_OPTIONS, + NULL, NULL, + GUC_NOT_IN_SAMPLE + }, + &skipscan_prefix_cols, + INDEX_MAX_KEYS, 0, INDEX_MAX_KEYS, + NULL, NULL, NULL + }, + { /* Can't be set in postgresql.conf */ {"server_version_num", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml index 2b3997988..9662fb2ba 100644 --- a/doc/src/sgml/btree.sgml +++ b/doc/src/sgml/btree.sgml @@ -583,6 +583,19 @@ options(relopts local_relopts *) returns + + skipsupport + + + Optionally, a btree operator family may provide a skip + support function, registered under support function + number 6. These functions allow the B-tree code to more efficiently + navigate the index structure via an index skip scan. The + APIs involved in this are defined in + src/include/utils/skipsupport.h. + + + diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index 6d731e070..651f9323e 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -457,23 +457,26 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor); A multicolumn B-tree index can be used with query conditions that involve any subset of the index's columns, but the index is most - efficient when there are constraints on the leading (leftmost) columns. - The exact rule is that equality constraints on leading columns, plus - any inequality constraints on the first column that does not have an - equality constraint, will be used to limit the portion of the index - that is scanned. Constraints on columns to the right of these columns - are checked in the index, so they save visits to the table proper, but - they do not reduce the portion of the index that has to be scanned. + efficient when there are equality constraints on the leading (leftmost) columns. + B-Tree index scans can use the index skip scan strategy to generate + equality constraints on prefix columns that were wholly omitted from the + query predicate, as well as prefix columns whose values were constrained by + inequality conditions. For example, given an index on (a, b, c) and a query condition WHERE a = 5 AND b >= 42 AND c < 77, the index would have to be scanned from the first entry with a = 5 and b = 42 up through the last entry with - a = 5. Index entries with c >= 77 would be - skipped, but they'd still have to be scanned through. + a = 5. Intevening groups of index entries with + c >= 77 would not need to be returned by the scan, + and can be skipped over entirely by applying the skip scan strategy. This index could in principle be used for queries that have constraints on b and/or c with no constraint on a - — but the entire index would have to be scanned, so in most cases - the planner would prefer a sequential table scan over using the index. + — but that approach is generally only taken when there are so few + distinct a values that the planner expects the skip scan + strategy to allow the scan to skip over most individual index leaf pages. + If there are many distinct a values, then the entire + index will have to be scanned, so in most cases the planner ill prefer a + sequential table scan over using the index. @@ -508,10 +511,7 @@ CREATE INDEX test2_mm_idx ON test2 (major, minor); - Multicolumn indexes should be used sparingly. In most situations, - an index on a single column is sufficient and saves space and time. - Indexes with more than three columns are unlikely to be helpful - unless the usage of the table is extremely stylized. See also + Multicolumn indexes should be used judiciously. See and for some discussion of the merits of different index configurations. @@ -669,9 +669,13 @@ CREATE INDEX test3_desc_index ON test3 (id DESC NULLS LAST); multicolumn index on (x, y). This index would typically be more efficient than index combination for queries involving both columns, but as discussed in , it - would be almost useless for queries involving only y, so it - should not be the only index. A combination of the multicolumn index - and a separate index on y would serve reasonably well. For + would be less useful for queries involving only y. Just + how useful might depend on how effective the B-Tree index skip scan + optimization is; if x has no more than several hundred + distinct values, skip scan will make searches for specific + y values execute reasonably efficiently. A combination + of a multicolumn index on (x, y) and a separate index on + y might also serve reasonably well. For queries involving only x, the multicolumn index could be used, though it would be larger and hence slower than an index on x alone. The last alternative is to create all three diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 22d8ad1aa..63f03f3a7 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -461,6 +461,13 @@ 5 + + + Return the addresses of C-callable skip support function(s) + (optional) + + 6 + @@ -1056,7 +1063,8 @@ DEFAULT FOR TYPE int8 USING btree FAMILY integer_ops AS FUNCTION 1 btint8cmp(int8, int8) , FUNCTION 2 btint8sortsupport(internal) , FUNCTION 3 in_range(int8, int8, int8, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint8skipsupport(internal) ; CREATE OPERATOR CLASS int4_ops DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS @@ -1069,7 +1077,8 @@ DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS FUNCTION 1 btint4cmp(int4, int4) , FUNCTION 2 btint4sortsupport(internal) , FUNCTION 3 in_range(int4, int4, int4, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint4skipsupport(internal) ; CREATE OPERATOR CLASS int2_ops DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS @@ -1082,7 +1091,8 @@ DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS FUNCTION 1 btint2cmp(int2, int2) , FUNCTION 2 btint2sortsupport(internal) , FUNCTION 3 in_range(int2, int2, int2, boolean, boolean) , - FUNCTION 4 btequalimage(oid) ; + FUNCTION 4 btequalimage(oid) , + FUNCTION 6 btint2skipsupport(internal) ; ALTER OPERATOR FAMILY integer_ops USING btree ADD -- cross-type comparisons int8 vs int2 diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index ae54cb254..8b6b775c1 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -362,9 +362,9 @@ ERROR: invalid operator number 0, must be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ERROR: operator argument types must be specified in ALTER OPERATOR FAMILY ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ERROR: invalid function number 0, must be between 1 and 5 -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 -ERROR: invalid function number 6, must be between 1 and 5 +ERROR: invalid function number 0, must be between 1 and 6 +ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 +ERROR: invalid function number 7, must be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index 3bbe4c5f9..a8d5be6c1 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -5138,9 +5138,10 @@ List of access methods btree | uuid_ops | uuid | uuid | 1 | uuid_cmp btree | uuid_ops | uuid | uuid | 2 | uuid_sortsupport btree | uuid_ops | uuid | uuid | 4 | btequalimage + btree | uuid_ops | uuid | uuid | 6 | uuid_skipsupport hash | uuid_ops | uuid | uuid | 1 | uuid_hash hash | uuid_ops | uuid | uuid | 2 | uuid_hash_extended -(5 rows) +(6 rows) -- check \dconfig set work_mem = 10240; diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql index de58d268d..4246afefd 100644 --- a/src/test/regress/sql/alter_generic.sql +++ b/src/test/regress/sql/alter_generic.sql @@ -310,7 +310,7 @@ ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 6 < (int4, int2); -- ope ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 0 < (int4, int2); -- operator number should be between 1 and 5 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function -ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5 +ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6 ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY DROP OPERATOR FAMILY alt_opf4 USING btree; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 547d14b3e..3dffb3856 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -218,6 +218,7 @@ BTScanPos BTScanPosData BTScanPosItem BTShared +BTSkipPreproc BTSortArrayContext BTSpool BTStack @@ -2660,6 +2661,8 @@ SingleBoundSortItem SinglePartitionSpec Size SkipPages +SkipSupport +SkipSupportData SlabBlock SlabContext SlabSlot -- 2.45.2