v2-0001-Add-skip-scan-to-nbtree.patch
application/octet-stream
Filename: v2-0001-Add-skip-scan-to-nbtree.patch
Type: application/octet-stream
Part: 0
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v2-0001
Subject: Add skip scan to nbtree.
| File | + | − |
|---|---|---|
| doc/src/sgml/btree.sgml | 13 | 0 |
| doc/src/sgml/xindex.sgml | 13 | 3 |
| src/backend/access/nbtree/nbtcompare.c | 201 | 0 |
| src/backend/access/nbtree/nbtree.c | 6 | 4 |
| src/backend/access/nbtree/nbtutils.c | 1239 | 160 |
| src/backend/access/nbtree/nbtvalidate.c | 4 | 0 |
| src/backend/commands/opclasscmds.c | 25 | 0 |
| src/backend/utils/adt/date.c | 34 | 0 |
| src/backend/utils/adt/Makefile | 1 | 0 |
| src/backend/utils/adt/meson.build | 1 | 0 |
| src/backend/utils/adt/selfuncs.c | 28 | 2 |
| src/backend/utils/adt/skipsupport.c | 54 | 0 |
| src/backend/utils/adt/uuid.c | 65 | 0 |
| src/backend/utils/misc/guc_tables.c | 12 | 0 |
| src/include/access/nbtree.h | 14 | 2 |
| src/include/catalog/pg_amproc.dat | 16 | 0 |
| src/include/catalog/pg_proc.dat | 24 | 0 |
| src/include/utils/skipsupport.h | 140 | 0 |
| src/test/regress/expected/alter_generic.out | 3 | 3 |
| src/test/regress/expected/psql.out | 2 | 1 |
| src/test/regress/sql/alter_generic.sql | 1 | 1 |
| src/tools/pgindent/typedefs.list | 3 | 0 |
From d41c1da841e4ab6245caff02d17b945f8346b47b Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 16 Apr 2024 13:21:36 -0400
Subject: [PATCH v2] Add skip scan to nbtree.
Skip scan allows nbtree index scans to efficiently use a composite index
on an index (a, b) for queries with a predicate such as "WHERE b = 5".
This is useful in cases where the total number of distinct values in the
column 'a' is reasonably small (think hundreds, possibly thousands).
In effect, a skip scan treats the composite index on (a, b) as if it was
a series of disjunct subindexes -- one subindex per distinct 'a' value.
We exhaustively "search every subindex" using a qual that behaves like
"WHERE a = ANY(<every possible 'a' value>) AND b = 5".
---
src/include/access/nbtree.h | 16 +-
src/include/catalog/pg_amproc.dat | 16 +
src/include/catalog/pg_proc.dat | 24 +
src/include/utils/skipsupport.h | 140 ++
src/backend/access/nbtree/nbtcompare.c | 201 +++
src/backend/access/nbtree/nbtree.c | 10 +-
src/backend/access/nbtree/nbtutils.c | 1399 ++++++++++++++++---
src/backend/access/nbtree/nbtvalidate.c | 4 +
src/backend/commands/opclasscmds.c | 25 +
src/backend/utils/adt/Makefile | 1 +
src/backend/utils/adt/date.c | 34 +
src/backend/utils/adt/meson.build | 1 +
src/backend/utils/adt/selfuncs.c | 30 +-
src/backend/utils/adt/skipsupport.c | 54 +
src/backend/utils/adt/uuid.c | 65 +
src/backend/utils/misc/guc_tables.c | 12 +
doc/src/sgml/btree.sgml | 13 +
doc/src/sgml/xindex.sgml | 16 +-
src/test/regress/expected/alter_generic.out | 6 +-
src/test/regress/expected/psql.out | 3 +-
src/test/regress/sql/alter_generic.sql | 2 +-
src/tools/pgindent/typedefs.list | 3 +
22 files changed, 1899 insertions(+), 176 deletions(-)
create mode 100644 src/include/utils/skipsupport.h
create mode 100644 src/backend/utils/adt/skipsupport.c
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 749304334..81e99fcc1 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -24,6 +24,7 @@
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
+#include "utils/skipsupport.h"
/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
typedef uint16 BTCycleId;
@@ -709,7 +710,8 @@ BTreeTupleGetMaxHeapTID(IndexTuple itup)
#define BTINRANGE_PROC 3
#define BTEQUALIMAGE_PROC 4
#define BTOPTIONS_PROC 5
-#define BTNProcs 5
+#define BTSKIPSUPPORT_PROC 6
+#define BTNProcs 6
/*
* We need to be able to tell the difference between read and write
@@ -1032,9 +1034,15 @@ typedef BTScanPosData *BTScanPos;
typedef struct BTArrayKeyInfo
{
int scan_key; /* index of associated key in keyData */
+ int num_elems; /* number of elems (-1 for skip array) */
+
+ /* State used by standard arrays that store elements in memory */
int cur_elem; /* index of current element in elem_values */
- int num_elems; /* number of elems in current array value */
Datum *elem_values; /* array of num_elems Datums */
+
+ /* State used by skip arrays, which generate elements procedurally */
+ SkipSupportData sksup; /* opclass skip scan support */
+ bool null_elem; /* lowest/highest element actually NULL? */
} BTArrayKeyInfo;
typedef struct BTScanOpaqueData
@@ -1123,6 +1131,7 @@ typedef struct BTReadPageState
*/
#define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */
#define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */
+#define SK_BT_SKIP 0x00040000 /* SK_SEARCHARRAY skip scan key */
#define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */
#define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
@@ -1159,6 +1168,9 @@ typedef struct BTOptions
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4
#define PROGRESS_BTREE_PHASE_LEAF_LOAD 5
+/* GUC parameter (just a temporary convenience for reviewers) */
+extern PGDLLIMPORT int skipscan_prefix_cols;
+
/*
* external entry points for btree, in nbtree.c
*/
diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat
index f639c3a6a..2a8f6f3f1 100644
--- a/src/include/catalog/pg_amproc.dat
+++ b/src/include/catalog/pg_amproc.dat
@@ -21,6 +21,8 @@
amprocrighttype => 'bit', amprocnum => '4', amproc => 'btequalimage' },
{ amprocfamily => 'btree/bool_ops', amproclefttype => 'bool',
amprocrighttype => 'bool', amprocnum => '1', amproc => 'btboolcmp' },
+{ amprocfamily => 'btree/bool_ops', amproclefttype => 'bool',
+ amprocrighttype => 'bool', amprocnum => '6', amproc => 'btboolskipsupport' },
{ amprocfamily => 'btree/bool_ops', amproclefttype => 'bool',
amprocrighttype => 'bool', amprocnum => '4', amproc => 'btequalimage' },
{ amprocfamily => 'btree/bpchar_ops', amproclefttype => 'bpchar',
@@ -41,12 +43,16 @@
amprocrighttype => 'char', amprocnum => '1', amproc => 'btcharcmp' },
{ amprocfamily => 'btree/char_ops', amproclefttype => 'char',
amprocrighttype => 'char', amprocnum => '4', amproc => 'btequalimage' },
+{ amprocfamily => 'btree/char_ops', amproclefttype => 'char',
+ amprocrighttype => 'char', amprocnum => '6', amproc => 'btcharskipsupport' },
{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date',
amprocrighttype => 'date', amprocnum => '1', amproc => 'date_cmp' },
{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date',
amprocrighttype => 'date', amprocnum => '2', amproc => 'date_sortsupport' },
{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date',
amprocrighttype => 'date', amprocnum => '4', amproc => 'btequalimage' },
+{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date',
+ amprocrighttype => 'date', amprocnum => '6', amproc => 'date_skipsupport' },
{ amprocfamily => 'btree/datetime_ops', amproclefttype => 'date',
amprocrighttype => 'timestamp', amprocnum => '1',
amproc => 'date_cmp_timestamp' },
@@ -122,6 +128,8 @@
amprocrighttype => 'int2', amprocnum => '2', amproc => 'btint2sortsupport' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2',
amprocrighttype => 'int2', amprocnum => '4', amproc => 'btequalimage' },
+{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2',
+ amprocrighttype => 'int2', amprocnum => '6', amproc => 'btint2skipsupport' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2',
amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint24cmp' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int2',
@@ -141,6 +149,8 @@
amprocrighttype => 'int4', amprocnum => '2', amproc => 'btint4sortsupport' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4',
amprocrighttype => 'int4', amprocnum => '4', amproc => 'btequalimage' },
+{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4',
+ amprocrighttype => 'int4', amprocnum => '6', amproc => 'btint4skipsupport' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4',
amprocrighttype => 'int8', amprocnum => '1', amproc => 'btint48cmp' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int4',
@@ -160,6 +170,8 @@
amprocrighttype => 'int8', amprocnum => '2', amproc => 'btint8sortsupport' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8',
amprocrighttype => 'int8', amprocnum => '4', amproc => 'btequalimage' },
+{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8',
+ amprocrighttype => 'int8', amprocnum => '6', amproc => 'btint8skipsupport' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8',
amprocrighttype => 'int4', amprocnum => '1', amproc => 'btint84cmp' },
{ amprocfamily => 'btree/integer_ops', amproclefttype => 'int8',
@@ -193,6 +205,8 @@
amprocrighttype => 'oid', amprocnum => '2', amproc => 'btoidsortsupport' },
{ amprocfamily => 'btree/oid_ops', amproclefttype => 'oid',
amprocrighttype => 'oid', amprocnum => '4', amproc => 'btequalimage' },
+{ amprocfamily => 'btree/oid_ops', amproclefttype => 'oid',
+ amprocrighttype => 'oid', amprocnum => '6', amproc => 'btoidskipsupport' },
{ amprocfamily => 'btree/oidvector_ops', amproclefttype => 'oidvector',
amprocrighttype => 'oidvector', amprocnum => '1',
amproc => 'btoidvectorcmp' },
@@ -261,6 +275,8 @@
amprocrighttype => 'uuid', amprocnum => '2', amproc => 'uuid_sortsupport' },
{ amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid',
amprocrighttype => 'uuid', amprocnum => '4', amproc => 'btequalimage' },
+{ amprocfamily => 'btree/uuid_ops', amproclefttype => 'uuid',
+ amprocrighttype => 'uuid', amprocnum => '6', amproc => 'uuid_skipsupport' },
{ amprocfamily => 'btree/record_ops', amproclefttype => 'record',
amprocrighttype => 'record', amprocnum => '1', amproc => 'btrecordcmp' },
{ amprocfamily => 'btree/record_image_ops', amproclefttype => 'record',
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d4ac578ae..d02dd1a0c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1004,18 +1004,27 @@
{ oid => '3129', descr => 'sort support',
proname => 'btint2sortsupport', prorettype => 'void',
proargtypes => 'internal', prosrc => 'btint2sortsupport' },
+{ oid => '9290', descr => 'skip support',
+ proname => 'btint2skipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'btint2skipsupport' },
{ oid => '351', descr => 'less-equal-greater',
proname => 'btint4cmp', proleakproof => 't', prorettype => 'int4',
proargtypes => 'int4 int4', prosrc => 'btint4cmp' },
{ oid => '3130', descr => 'sort support',
proname => 'btint4sortsupport', prorettype => 'void',
proargtypes => 'internal', prosrc => 'btint4sortsupport' },
+{ oid => '9291', descr => 'skip support',
+ proname => 'btint4skipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'btint4skipsupport' },
{ oid => '842', descr => 'less-equal-greater',
proname => 'btint8cmp', proleakproof => 't', prorettype => 'int4',
proargtypes => 'int8 int8', prosrc => 'btint8cmp' },
{ oid => '3131', descr => 'sort support',
proname => 'btint8sortsupport', prorettype => 'void',
proargtypes => 'internal', prosrc => 'btint8sortsupport' },
+{ oid => '9292', descr => 'skip support',
+ proname => 'btint8skipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'btint8skipsupport' },
{ oid => '354', descr => 'less-equal-greater',
proname => 'btfloat4cmp', proleakproof => 't', prorettype => 'int4',
proargtypes => 'float4 float4', prosrc => 'btfloat4cmp' },
@@ -1034,12 +1043,18 @@
{ oid => '3134', descr => 'sort support',
proname => 'btoidsortsupport', prorettype => 'void',
proargtypes => 'internal', prosrc => 'btoidsortsupport' },
+{ oid => '9293', descr => 'skip support',
+ proname => 'btoidskipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'btoidskipsupport' },
{ oid => '404', descr => 'less-equal-greater',
proname => 'btoidvectorcmp', proleakproof => 't', prorettype => 'int4',
proargtypes => 'oidvector oidvector', prosrc => 'btoidvectorcmp' },
{ oid => '358', descr => 'less-equal-greater',
proname => 'btcharcmp', proleakproof => 't', prorettype => 'int4',
proargtypes => 'char char', prosrc => 'btcharcmp' },
+{ oid => '9294', descr => 'skip support',
+ proname => 'btcharskipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'btcharskipsupport' },
{ oid => '359', descr => 'less-equal-greater',
proname => 'btnamecmp', proleakproof => 't', prorettype => 'int4',
proargtypes => 'name name', prosrc => 'btnamecmp' },
@@ -2214,6 +2229,9 @@
{ oid => '3136', descr => 'sort support',
proname => 'date_sortsupport', prorettype => 'void',
proargtypes => 'internal', prosrc => 'date_sortsupport' },
+{ oid => '9295', descr => 'skip support',
+ proname => 'date_skipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'date_skipsupport' },
{ oid => '4133', descr => 'window RANGE support',
proname => 'in_range', prorettype => 'bool',
proargtypes => 'date date interval bool bool',
@@ -4368,6 +4386,9 @@
{ oid => '1693', descr => 'less-equal-greater',
proname => 'btboolcmp', proleakproof => 't', prorettype => 'int4',
proargtypes => 'bool bool', prosrc => 'btboolcmp' },
+{ oid => '9296', descr => 'skip support',
+ proname => 'btboolskipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'btboolskipsupport' },
{ oid => '1688', descr => 'hash',
proname => 'time_hash', prorettype => 'int4', proargtypes => 'time',
@@ -9175,6 +9196,9 @@
{ oid => '3300', descr => 'sort support',
proname => 'uuid_sortsupport', prorettype => 'void',
proargtypes => 'internal', prosrc => 'uuid_sortsupport' },
+{ oid => '9297', descr => 'skip support',
+ proname => 'uuid_skipsupport', prorettype => 'void',
+ proargtypes => 'internal', prosrc => 'uuid_skipsupport' },
{ oid => '2961', descr => 'I/O',
proname => 'uuid_recv', prorettype => 'uuid', proargtypes => 'internal',
prosrc => 'uuid_recv' },
diff --git a/src/include/utils/skipsupport.h b/src/include/utils/skipsupport.h
new file mode 100644
index 000000000..a71a624d0
--- /dev/null
+++ b/src/include/utils/skipsupport.h
@@ -0,0 +1,140 @@
+/*-------------------------------------------------------------------------
+ *
+ * skipsupport.h
+ * Support routines for B-Tree skip scans.
+ *
+ * B-Tree operator classes for discrete types can optionally provide a support
+ * function for skipping. This is used during skip scans.
+ *
+ * A B-tree operator class that implements skip support provides B-tree index
+ * scans with a way of enumerating and iterating through every possible value
+ * from the domain of indexable values. This gives scans a way to determine
+ * the next value in line for a given skip array/scan key/skipped attribute.
+ * This happens at the point where the scan determines that another primitive
+ * index scan is required. The next value is used (in combination with at
+ * least one additional lower-order non-skip key, taken from the SQL query) to
+ * relocate the scan, skipping over many irrelevant leaf pages in the process.
+ *
+ * There are many data types/opclasses where implementing a skip support
+ * scheme is inherently impossible (or at least impractical). Obviously, it
+ * would be wrong if the "next" value generated by an opclass was actually
+ * after the true next value (any index tuples with the true next value would
+ * be overlooked by the index scan). This partly explains why opclasses are
+ * under no obligation to implement skip support: a continuous type may have
+ * no way of generating a useful next value.
+ *
+ * Skip scan generally works best with discrete types such as integer, date,
+ * and boolean: types where we expect indexes to contain large groups of
+ * contiguous values (in respect of the leading/skipped index attribute).
+ * When gaps/discontinuities are naturally rare (e.g., a leading identity
+ * column in a composite index, a date column preceding a product_id column),
+ * then it makes sense for the skip scan to optimistically assume that the
+ * next distinct indexable value will find directly matching index tuples.
+ * The B-Tree code can fall back on explicit next-key probes for any opclass
+ * that doesn't include a skip support function, but it's best to provide skip
+ * support whenever possible. The B-Tree code assumes that it's always better
+ * to use the opclass skip support routine where available.
+ *
+ * When a skip scan "bets" that the next indexable value will find an exact
+ * match, there is significant upside, without any accompanying downside.
+ * When this optimistic strategy works out, the scan avoids the cost of an
+ * explicit probe (used in the no-skip-support case to determine the true next
+ * value in the index's skip attribute). When the strategy doesn't work out,
+ * then the scan is no worse off than it would have been without skip support.
+ * The explicit next-key probes used by B-Tree skip scan's fallback path are
+ * very similar to "failed" optimistic searches for the next indexable value
+ * (the next value according to the opclass skip support routine).
+ *
+ * (FIXME Actually, nbtree does no such thing right now, which is considered a
+ * blocker to commit.)
+ *
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/skipsupport.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SKIPSUPPORT_H
+#define SKIPSUPPORT_H
+
+#include "utils/relcache.h"
+
+typedef struct SkipSupportData *SkipSupport;
+
+/*
+ * State/callbacks used by skip arrays to procedurally generate elements.
+ *
+ * A BTSKIPSUPPORT_PROC function must set each and every field when called.
+ * If an opclass can only set some of the fields, then it cannot safely
+ * provide a skip support routine (and so must rely on the fallback strategy
+ * used by continuous types, such as numeric).
+ */
+typedef struct SkipSupportData
+{
+ /*
+ * low_elem and high_elem must be set with the lowest and highest possible
+ * values from the domain of indexable values (assuming standard ascending
+ * order). This helps the B-Tree code with finding its initial position
+ * at the leaf level (during the skip scan's first primitive index scan).
+ * In other words, it gives the B-Tree code a useful value to start from,
+ * before any data has been read from the index.
+ *
+ * low_elem and high_elem can also be used to prove that a qual is
+ * unsatisfiable in certain cross-type scenarios.
+ *
+ * low_elem and high_elem are also used by skip scans to determine when
+ * they've reached the final possible value (in the current direction).
+ * It's typical for the scan to run out of leaf pages before it runs out
+ * of unscanned indexable values, but it's still useful for the scan to
+ * have a way to recognize when it has reached the last possible value
+ * (this saves us a useless probe that just lands on the final leaf page).
+ *
+ * Note: the logic for determining that the scan has reached the final
+ * possible value naturally belongs in the B-Tree code. The final value
+ * isn't necessarily the original high_elem/low_elem set by the opclass.
+ * In particular, it'll be a lower/higher value when B-Tree preprocessing
+ * determines that the true range of possible values should be restricted,
+ * due to the presence of an inequality applied to the index's skipped
+ * attribute. These are range skip scans.
+ */
+ Datum low_elem; /* lowest sorting/leftmost non-NULL value */
+ Datum high_elem; /* highest sorting/rightmost non-NULL value */
+
+ /*
+ * Decrement/increment functions.
+ *
+ * Returns a decremented/incremented copy of caller's existing datum,
+ * allocated in caller's memory context (in the case of pass-by-reference
+ * types). It's not okay for these functions to leak any memory.
+ *
+ * Both decrement and increment callbacks are guaranteed to never be
+ * called with a NULL "existing" arg. (In general it is the B-Tree code's
+ * job to worry about NULLs, and about whether indexed values are stored
+ * in ASC order or DESC order.)
+ *
+ * The decrement callback is guaranteed to only be called with an
+ * "existing" value that's strictly > the low_elem set by the opclass.
+ * Similarly, the increment callback is guaranteed to only be called with
+ * an "existing" value that's strictly < the high_elem set by the opclass.
+ * Consequently, opclasses don't have to deal with "overflow" themselves
+ * (though asserting that the B-Tree code got it right is a good idea).
+ *
+ * It's quite possible (and very common) for the B-Tree skip scan caller's
+ * "existing" datum to just be a straight copy of a value that it copied
+ * from the index. Operator classes must be liberal in accepting every
+ * possible representational variation within the underlying data type.
+ * Opclasses don't have to preserve whatever semantically insignificant
+ * information the data type might be carrying around, though.
+ *
+ * Note: < and > are defined by the opclass's ORDER proc in the usual way.
+ */
+ Datum (*decrement) (Relation rel, Datum existing);
+ Datum (*increment) (Relation rel, Datum existing);
+} SkipSupportData;
+
+extern bool PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype,
+ bool reverse, SkipSupport sksup);
+
+#endif /* SKIPSUPPORT_H */
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 1c72867c8..48a877613 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -58,6 +58,7 @@
#include <limits.h>
#include "utils/fmgrprotos.h"
+#include "utils/skipsupport.h"
#include "utils/sortsupport.h"
#ifdef STRESS_SORT_INT_MIN
@@ -78,6 +79,39 @@ btboolcmp(PG_FUNCTION_ARGS)
PG_RETURN_INT32((int32) a - (int32) b);
}
+static Datum
+bool_decrement(Relation rel, Datum existing)
+{
+ bool bexisting = DatumGetBool(existing);
+
+ Assert(bexisting == true);
+
+ return BoolGetDatum(bexisting - 1);
+}
+
+static Datum
+bool_increment(Relation rel, Datum existing)
+{
+ bool bexisting = DatumGetBool(existing);
+
+ Assert(bexisting == false);
+
+ return BoolGetDatum(bexisting + 1);
+}
+
+Datum
+btboolskipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+
+ sksup->decrement = bool_decrement;
+ sksup->increment = bool_increment;
+ sksup->low_elem = BoolGetDatum(false);
+ sksup->high_elem = BoolGetDatum(true);
+
+ PG_RETURN_VOID();
+}
+
Datum
btint2cmp(PG_FUNCTION_ARGS)
{
@@ -105,6 +139,39 @@ btint2sortsupport(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+static Datum
+int2_decrement(Relation rel, Datum existing)
+{
+ int16 iexisting = DatumGetInt16(existing);
+
+ Assert(iexisting > PG_INT16_MIN);
+
+ return Int16GetDatum(iexisting - 1);
+}
+
+static Datum
+int2_increment(Relation rel, Datum existing)
+{
+ int16 iexisting = DatumGetInt16(existing);
+
+ Assert(iexisting < PG_INT16_MAX);
+
+ return Int16GetDatum(iexisting + 1);
+}
+
+Datum
+btint2skipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+
+ sksup->decrement = int2_decrement;
+ sksup->increment = int2_increment;
+ sksup->low_elem = Int16GetDatum(PG_INT16_MIN);
+ sksup->high_elem = Int16GetDatum(PG_INT16_MAX);
+
+ PG_RETURN_VOID();
+}
+
Datum
btint4cmp(PG_FUNCTION_ARGS)
{
@@ -128,6 +195,39 @@ btint4sortsupport(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+static Datum
+int4_decrement(Relation rel, Datum existing)
+{
+ int32 iexisting = DatumGetInt32(existing);
+
+ Assert(iexisting > PG_INT32_MIN);
+
+ return Int32GetDatum(iexisting - 1);
+}
+
+static Datum
+int4_increment(Relation rel, Datum existing)
+{
+ int32 iexisting = DatumGetInt32(existing);
+
+ Assert(iexisting < PG_INT32_MAX);
+
+ return Int32GetDatum(iexisting + 1);
+}
+
+Datum
+btint4skipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+
+ sksup->decrement = int4_decrement;
+ sksup->increment = int4_increment;
+ sksup->low_elem = Int32GetDatum(PG_INT32_MIN);
+ sksup->high_elem = Int32GetDatum(PG_INT32_MAX);
+
+ PG_RETURN_VOID();
+}
+
Datum
btint8cmp(PG_FUNCTION_ARGS)
{
@@ -171,6 +271,39 @@ btint8sortsupport(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+static Datum
+int8_decrement(Relation rel, Datum existing)
+{
+ int64 iexisting = DatumGetInt64(existing);
+
+ Assert(iexisting > PG_INT64_MIN);
+
+ return Int64GetDatum(iexisting - 1);
+}
+
+static Datum
+int8_increment(Relation rel, Datum existing)
+{
+ int64 iexisting = DatumGetInt64(existing);
+
+ Assert(iexisting < PG_INT64_MAX);
+
+ return Int64GetDatum(iexisting + 1);
+}
+
+Datum
+btint8skipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+
+ sksup->decrement = int8_decrement;
+ sksup->increment = int8_increment;
+ sksup->low_elem = Int64GetDatum(PG_INT64_MIN);
+ sksup->high_elem = Int64GetDatum(PG_INT64_MAX);
+
+ PG_RETURN_VOID();
+}
+
Datum
btint48cmp(PG_FUNCTION_ARGS)
{
@@ -292,6 +425,39 @@ btoidsortsupport(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+static Datum
+oid_decrement(Relation rel, Datum existing)
+{
+ Oid oexisting = DatumGetObjectId(existing);
+
+ Assert(oexisting > InvalidOid);
+
+ return ObjectIdGetDatum(oexisting - 1);
+}
+
+static Datum
+oid_increment(Relation rel, Datum existing)
+{
+ Oid oexisting = DatumGetObjectId(existing);
+
+ Assert(oexisting < OID_MAX);
+
+ return ObjectIdGetDatum(oexisting + 1);
+}
+
+Datum
+btoidskipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+
+ sksup->decrement = oid_decrement;
+ sksup->increment = oid_increment;
+ sksup->low_elem = ObjectIdGetDatum(InvalidOid);
+ sksup->high_elem = ObjectIdGetDatum(OID_MAX);
+
+ PG_RETURN_VOID();
+}
+
Datum
btoidvectorcmp(PG_FUNCTION_ARGS)
{
@@ -325,3 +491,38 @@ btcharcmp(PG_FUNCTION_ARGS)
/* Be careful to compare chars as unsigned */
PG_RETURN_INT32((int32) ((uint8) a) - (int32) ((uint8) b));
}
+
+static Datum
+char_decrement(Relation rel, Datum existing)
+{
+ uint8 cexisting = UInt8GetDatum(existing);
+
+ Assert(cexisting > 0);
+
+ return CharGetDatum((uint8) cexisting - 1);
+}
+
+static Datum
+char_increment(Relation rel, Datum existing)
+{
+ uint8 cexisting = UInt8GetDatum(existing);
+
+ Assert(cexisting < UCHAR_MAX);
+
+ return CharGetDatum((uint8) cexisting + 1);
+}
+
+Datum
+btcharskipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+
+ sksup->decrement = char_decrement;
+ sksup->increment = char_increment;
+
+ /* btcharcmp compares chars as unsigned */
+ sksup->low_elem = UInt8GetDatum(0);
+ sksup->high_elem = UInt8GetDatum(UCHAR_MAX);
+
+ PG_RETURN_VOID();
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 686a3206f..9c9cd48f7 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -324,11 +324,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
BTScanPosInvalidate(so->currPos);
BTScanPosInvalidate(so->markPos);
- if (scan->numberOfKeys > 0)
- so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
- else
- so->keyData = NULL;
+ so->keyData = NULL;
so->needPrimScan = false;
so->scanBehind = false;
so->arrayKeys = NULL;
@@ -408,6 +405,11 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
scan->numberOfKeys * sizeof(ScanKeyData));
so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
so->numArrayKeys = 0; /* ditto */
+
+ /* Release private storage allocated in previous btrescan, if any */
+ if (so->keyData != NULL)
+ pfree(so->keyData);
+ so->keyData = NULL;
}
/*
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index d6de2072d..f4442d014 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -28,10 +28,44 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+#include "utils/skipsupport.h"
+
+/*
+ * GUC parameter (temporary convenience for reviewers).
+ *
+ * To disable all skipping, set skipscan_prefix_cols=0. Otherwise set it to
+ * the attribute number that you wish to make the last attribute number that
+ * we can add a skip scan key for.
+ *
+ * For example, setting skipscan_prefix_cols=1 before an index scan with qual
+ * "WHERE b = 1 AND c > 42" will make us generate a skip scan key on the
+ * column 'a' (which is attnum 1) only, preventing us from adding one for the
+ * column 'c' (and so 'c' will still have an inequality scan key, required in
+ * only one direction -- 'c' won't be output as a "range" skip key/array).
+ *
+ * The same scan keys will be output when skipscan_prefix_cols=2, given the
+ * same query/qual, since we naturally get a required equality scan key on 'b'
+ * from the input scan keys (provided we at least manage to add a skip scan
+ * key on 'a' that "anchors its required-ness" to the 'b' scan key.)
+ *
+ * When skipscan_prefix_cols is set to the number of key columns in the index,
+ * we're as aggressive as possible about adding skip scan arrays/scan keys.
+ * This is the current default behavior, and the behavior we're targeting for
+ * the committed patch (if there are slowdowns from being maximally aggressive
+ * here then the likely solution is to make _bt_advance_array_keys adaptive,
+ * rather than trying to predict what will work during preprocessing).
+ */
+int skipscan_prefix_cols;
#define LOOK_AHEAD_REQUIRED_RECHECKS 3
#define LOOK_AHEAD_DEFAULT_DISTANCE 5
+typedef struct BTSkipPreproc
+{
+ SkipSupportData sksup; /* opclass skip scan support */
+ Oid eq_op; /* InvalidOid means don't skip */
+} BTSkipPreproc;
+
typedef struct BTSortArrayContext
{
FmgrInfo *sortproc;
@@ -62,18 +96,49 @@ static bool _bt_compare_array_scankey_args(IndexScanDesc scan,
ScanKey arraysk, ScanKey skey,
FmgrInfo *orderproc, BTArrayKeyInfo *array,
bool *qual_ok);
-static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan);
+static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys);
static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
+static int _bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts);
+static bool _bt_skip_support(Relation rel, int add_skip_attno,
+ BTSkipPreproc *skipatts);
+static inline Datum _bt_apply_decrement(Relation rel, ScanKey skey,
+ BTArrayKeyInfo *array);
+static inline Datum _bt_apply_increment(Relation rel, ScanKey skey,
+ BTArrayKeyInfo *array);
static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
Datum tupdatum, bool tupnull,
- Datum arrdatum, ScanKey cur);
+ Datum arrdatum, bool arrnull,
+ ScanKey cur);
+static void _bt_apply_compare_array(ScanKey arraysk, ScanKey skey,
+ FmgrInfo *orderprocp,
+ BTArrayKeyInfo *array, bool *qual_ok);
+static void _bt_apply_compare_skiparray(IndexScanDesc scan, ScanKey arraysk,
+ ScanKey skey, FmgrInfo *orderproc,
+ FmgrInfo *orderprocp,
+ BTArrayKeyInfo *array, bool *qual_ok);
static int _bt_binsrch_array_skey(FmgrInfo *orderproc,
bool cur_elem_trig, ScanDirection dir,
Datum tupdatum, bool tupnull,
BTArrayKeyInfo *array, ScanKey cur,
int32 *set_elem_result);
+static void _bt_binsrch_skiparray_skey(FmgrInfo *orderproc,
+ bool cur_elem_trig, ScanDirection dir,
+ Datum tupdatum, bool tupnull,
+ BTArrayKeyInfo *array, ScanKey cur,
+ int32 *set_elem_result);
+static void _bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
+static void _bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
+static void _bt_scankey_set_low_or_high(Relation rel, ScanKey skey,
+ BTArrayKeyInfo *array, bool low_not_high);
+static void _bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array,
+ Datum tupdatum, bool tupnull);
+static void _bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
+static void _bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_advance_skip_array_key_increment(Relation rel, ScanDirection dir,
+ BTArrayKeyInfo *array, ScanKey skey,
+ FmgrInfo *orderproc);
static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
@@ -251,9 +316,6 @@ _bt_freestack(BTStack stack)
* It is convenient for _bt_preprocess_keys caller to have to deal with no
* more than one equality strategy array scan key per index attribute. We'll
* always be able to set things up that way when complete opfamilies are used.
- * Eliminated array scan keys can be recognized as those that have had their
- * sk_strategy field set to InvalidStrategy here by us. Caller should avoid
- * including these in the scan's so->keyData[] output array.
*
* We set the scan key references from the scan's BTArrayKeyInfo info array to
* offsets into the temp modified input array returned to caller. Scans that
@@ -261,18 +323,36 @@ _bt_freestack(BTStack stack)
* preprocessing steps are complete. This will convert the scan key offset
* references into references to the scan's so->keyData[] output scan keys.
*
+ * We're also responsible for generating skip arrays (and their associated
+ * scan keys) here. This enables skip scan. We do this for index attributes
+ * that initially lacked an equality condition within scan->keyData[], iff
+ * doing so allows a later scan key (that was passed to us in scan->keyData[])
+ * to be marked required by later preprocessing on output.
+ * _bt_decide_skipatts decides which attributes receive skip arrays.
+ *
+ * Caller must pass *numberOfKeys to give us a way to change the number of
+ * input scan keys (our output is caller's input). The returned array can be
+ * smaller than scan->keyData[] when we eliminated a redundant array scan key
+ * (redundant with some other array scan key, for the same attribute). It can
+ * also be larger when we added a skip array/skip scan key. Caller uses this
+ * to allocate so->keyData[] for the current btrescan.
+ *
* Note: the reason we need to return a temp scan key array, rather than just
* scribbling on scan->keyData, is that callers are permitted to call btrescan
* without supplying a new set of scankey data.
*/
static ScanKey
-_bt_preprocess_array_keys(IndexScanDesc scan)
+_bt_preprocess_array_keys(IndexScanDesc scan, int *numberOfKeys)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Relation rel = scan->indexRelation;
- int numberOfKeys = scan->numberOfKeys;
+ int numArrayKeyData = scan->numberOfKeys;
int16 *indoption = rel->rd_indoption;
- int numArrayKeys;
+ BTSkipPreproc skipatts[INDEX_MAX_KEYS];
+ int numArrayKeys,
+ numSkipArrayKeys,
+ output_ikey = 0;
+ AttrNumber attno_skip = 1;
int origarrayatt = InvalidAttrNumber,
origarraykey = -1;
Oid origelemtype = InvalidOid;
@@ -280,11 +360,14 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
MemoryContext oldContext;
ScanKey arrayKeyData; /* modified copy of scan->keyData */
- Assert(numberOfKeys);
+ Assert(scan->numberOfKeys);
- /* Quick check to see if there are any array keys */
+ /*
+ * Quick check to see if there are any array keys, or any missing keys we
+ * can generate a "skip scan" array key for ourselves
+ */
numArrayKeys = 0;
- for (int i = 0; i < numberOfKeys; i++)
+ for (int i = 0; i < scan->numberOfKeys; i++)
{
cur = &scan->keyData[i];
if (cur->sk_flags & SK_SEARCHARRAY)
@@ -300,6 +383,18 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
}
}
+ /* Consider generating skip arrays, and associated equality scan keys */
+ numSkipArrayKeys = _bt_decide_skipatts(scan, skipatts);
+ if (numSkipArrayKeys)
+ {
+ /* At least one skip array scan key must be added to arrayKeyData[] */
+ numArrayKeys += numSkipArrayKeys;
+ /* output scan key buffer allocation needs space for skip scan keys */
+ numArrayKeyData += numSkipArrayKeys;
+
+ elog(DEBUG2, "skipping %d index attributes", numSkipArrayKeys);
+ }
+
/* Quit if nothing to do. */
if (numArrayKeys == 0)
return NULL;
@@ -317,19 +412,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
oldContext = MemoryContextSwitchTo(so->arrayContext);
- /* Create modifiable copy of scan->keyData in the workspace context */
- arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData));
- memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData));
+ /* Create output scan keys in the workspace context */
+ arrayKeyData = (ScanKey) palloc(numArrayKeyData * sizeof(ScanKeyData));
/* Allocate space for per-array data in the workspace context */
so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
/* Allocate space for ORDER procs used to help _bt_checkkeys */
- so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo));
+ so->orderProcs = (FmgrInfo *) palloc(numArrayKeyData * sizeof(FmgrInfo));
- /* Now process each array key */
+ /*
+ * Process each array key, and generate skip arrays as needed. Also copy
+ * every scan->keyData[] input scan key (whether it's an array or not)
+ * into the arrayKeyData array we'll return to our caller (barring any
+ * array scan keys that we could eliminate early through array merging).
+ */
numArrayKeys = 0;
- for (int i = 0; i < numberOfKeys; i++)
+ for (int input_ikey = 0; input_ikey < scan->numberOfKeys; input_ikey++)
{
FmgrInfo sortproc;
FmgrInfo *sortprocp = &sortproc;
@@ -345,14 +444,78 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
int num_nonnulls;
int j;
- cur = &arrayKeyData[i];
- if (!(cur->sk_flags & SK_SEARCHARRAY))
- continue;
+ /* Create a skip array and scan key where indicated by skipatts */
+ while (numSkipArrayKeys &&
+ attno_skip <= scan->keyData[input_ikey].sk_attno)
+ {
+ Oid opcintype = rel->rd_opcintype[attno_skip - 1];
+ Oid collation = rel->rd_indcollation[attno_skip - 1];
+ Oid eq_op = skipatts[attno_skip - 1].eq_op;
+ RegProcedure cmp_proc;
+
+ if (!OidIsValid(eq_op))
+ {
+ /* won't skip using this attribute */
+ attno_skip++;
+ continue;
+ }
+
+ cmp_proc = get_opcode(eq_op);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing oprcode for skipping equals operator %u", eq_op);
+
+ cur = &arrayKeyData[output_ikey];
+ Assert(attno_skip <= scan->keyData[input_ikey].sk_attno);
+ ScanKeyEntryInitialize(cur,
+ SK_SEARCHARRAY | SK_BT_SKIP, /* flags */
+ attno_skip, /* skipped att number */
+ BTEqualStrategyNumber, /* equality strategy */
+ InvalidOid, /* opclass input subtype */
+ collation, /* index column's collation */
+ cmp_proc, /* equality operator's proc */
+ (Datum) 0); /* constant */
+
+ /* Initialize array fields */
+ so->arrayKeys[numArrayKeys].scan_key = output_ikey;
+ so->arrayKeys[numArrayKeys].num_elems = -1;
+ so->arrayKeys[numArrayKeys].cur_elem = 0;
+ so->arrayKeys[numArrayKeys].elem_values = NULL; /* unusued */
+ so->arrayKeys[numArrayKeys].sksup = skipatts[attno_skip - 1].sksup;
+ so->arrayKeys[numArrayKeys].null_elem = true; /* for now */
+
+ /*
+ * We'll need a 3-way ORDER proc to determine when and how the
+ * consed-up "array" will advance inside _bt_advance_array_keys.
+ * Set one up now.
+ */
+ _bt_setup_array_cmp(scan, cur, opcintype,
+ &so->orderProcs[output_ikey], NULL);
+
+ /*
+ * Prepare to output next scan key (might be another skip scan
+ * key, or it could be an input scan key from scan->keyData[])
+ */
+ numSkipArrayKeys--;
+ numArrayKeys++;
+ attno_skip++;
+ output_ikey++; /* keep this scan key/array */
+ }
/*
- * First, deconstruct the array into elements. Anything allocated
- * here (including a possibly detoasted array value) is in the
- * workspace context.
+ * Copy input scan key into temp arrayKeyData scan key array. (From
+ * here on, cur points at our copy of the input scan key.)
+ */
+ cur = &arrayKeyData[output_ikey];
+ *cur = scan->keyData[input_ikey];
+
+ if (!(cur->sk_flags & SK_SEARCHARRAY))
+ {
+ output_ikey++; /* keep this non-array scan key */
+ continue;
+ }
+
+ /*
+ * Deconstruct the array into elements
*/
arrayval = DatumGetArrayTypeP(cur->sk_argument);
/* We could cache this data, but not clear it's worth it */
@@ -406,6 +569,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
_bt_find_extreme_element(scan, cur, elemtype,
BTGreaterStrategyNumber,
elem_values, num_nonnulls);
+ output_ikey++; /* keep this transformed scan key */
continue;
case BTEqualStrategyNumber:
/* proceed with rest of loop */
@@ -416,6 +580,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
_bt_find_extreme_element(scan, cur, elemtype,
BTLessStrategyNumber,
elem_values, num_nonnulls);
+ output_ikey++; /* keep this transformed scan key */
continue;
default:
elog(ERROR, "unrecognized StrategyNumber: %d",
@@ -432,7 +597,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
* sortproc just points to the same proc used during binary searches.
*/
_bt_setup_array_cmp(scan, cur, elemtype,
- &so->orderProcs[i], &sortprocp);
+ &so->orderProcs[output_ikey], &sortprocp);
/*
* Sort the non-null elements and eliminate any duplicates. We must
@@ -476,11 +641,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
break;
}
- /*
- * Indicate to _bt_preprocess_keys caller that it must ignore
- * this scan key
- */
- cur->sk_strategy = InvalidStrategy;
+ /* Throw away this array */
continue;
}
@@ -511,12 +672,16 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
* Note: _bt_preprocess_array_keys_final will fix-up each array's
* scan_key field later on, after so->keyData[] has been finalized.
*/
- so->arrayKeys[numArrayKeys].scan_key = i;
+ so->arrayKeys[numArrayKeys].scan_key = output_ikey;
so->arrayKeys[numArrayKeys].num_elems = num_elems;
so->arrayKeys[numArrayKeys].elem_values = elem_values;
+ so->arrayKeys[numArrayKeys].null_elem = false; /* unused */
numArrayKeys++;
+ output_ikey++; /* keep this scan key/array */
}
+ /* Set final number of arrayKeyData[] keys, array keys */
+ *numberOfKeys = output_ikey;
so->numArrayKeys = numArrayKeys;
MemoryContextSwitchTo(oldContext);
@@ -624,7 +789,8 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
{
BTArrayKeyInfo *array = &so->arrayKeys[arrayidx];
- Assert(array->num_elems > 0);
+ Assert(array->num_elems > 0 || array->num_elems == -1);
+ Assert(array->num_elems != -1 || outkey->sk_flags & SK_BT_REQFWD);
if (array->scan_key == input_ikey)
{
@@ -685,6 +851,245 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
so->numArrayKeys, INDEX_MAX_KEYS)));
}
+/*
+ * _bt_decide_skipatts() -- set index attributes requiring skip arrays
+ *
+ * _bt_preprocess_array_keys helper function. Determines which attributes
+ * will require skip arrays/scan keys. Also sets up skip support function for
+ * each of these attributes.
+ *
+ * This sets up "skip scan". Adding skip arrays (and associated scan keys)
+ * allows _bt_preprocess_keys to mark lower-order scan keys (copied from the
+ * original scan->keyData[] array in the conventional way) as required. The
+ * overall effect is to enable skipping over irrelevant sections of the index.
+ *
+ * Return value is the total number of scan keys to add as "input" scan keys
+ * for further processing within _bt_preprocess_keys.
+ */
+static int
+_bt_decide_skipatts(IndexScanDesc scan, BTSkipPreproc *skipatts)
+{
+ Relation rel = scan->indexRelation;
+ ScanKey inputsk;
+ AttrNumber attno_inputsk = 1,
+ attno_skip = 1;
+ bool attno_has_equal = false,
+ attno_has_rowcompare = false;
+ int numSkipArrayKeys = 0,
+ prev_numSkipArrayKeys = 0;
+
+ Assert(scan->numberOfKeys);
+
+ /*
+ * XXX Don't support system catalogs for now. Calls to routines like
+ * get_opfamily_member() are prone to infinite recursion, which we'll need
+ * to find workaround for (hard-coded lookups?).
+ */
+ if (IsCatalogRelation(rel))
+ return 0;
+
+ /*
+ * FIXME Also don't support parallel scans for now. Must add logic to
+ * places like _bt_parallel_primscan_schedule so that we account for skip
+ * arrays when parallel workers serialize their array scan state.
+ */
+ if (scan->parallel_scan)
+ return 0;
+
+ inputsk = &scan->keyData[0];
+ for (int i = 0;; inputsk++, i++)
+ {
+ /*
+ * Backfill skip arrays for any wholly omitted attributes prior to
+ * attno_inputsk
+ */
+ while (attno_skip < attno_inputsk)
+ {
+ if (!_bt_skip_support(rel, attno_skip, &skipatts[attno_skip - 1]))
+ {
+ /*
+ * Opclass lacks a suitable skip support routine.
+ *
+ * Return prev_numSkipArrayKeys, so as to avoid including any
+ * "backfilled" arrays that were supposed to form a contiguous
+ * group with a skip array on this attribute. There is no
+ * benefit to adding backfill skip arrays unless we can do so
+ * for all attributes (all attributes up to and including the
+ * one immediately before attno_inputsk).
+ */
+ return prev_numSkipArrayKeys;
+ }
+
+ /* plan on adding a backfill skip array for this attribute */
+ numSkipArrayKeys++;
+ attno_skip++;
+ }
+
+ /*
+ * Stop once past the final input scan key. We deliberately never add
+ * a skip attribute for the attribute of the last input scan key.
+ *
+ * If the last input scan key(s) use equality strategy, then a skip
+ * attribute is superfluous at best. If the last input scan key uses
+ * an inequality strategy, then adding a skip scan array/scan key is a
+ * valid though suboptimal transformation. It is better to arrange
+ * for preprocessing to allow such an input inequality scan key to
+ * remain an inequality on output. That way _bt_checkkeys will be
+ * able to make best use of both of its precheck optimizations, but
+ * _bt_first will be no less capable of efficiently finding the
+ * starting position for each primitive index scan.
+ */
+ if (i >= scan->numberOfKeys)
+ break;
+
+ /*
+ * Cannot keep adding skip arrays after a RowCompare
+ */
+ if (attno_has_rowcompare)
+ break;
+
+ /*
+ * Apply temporary testing GUC that can be used to disable skipping
+ * (either in part or in whole)
+ */
+ if (attno_inputsk > skipscan_prefix_cols)
+ break;
+
+ /*
+ * Now consider next attno_inputsk (or keep going if this is an
+ * additional scan key against the same attribute)
+ */
+ if (attno_inputsk < inputsk->sk_attno)
+ {
+ prev_numSkipArrayKeys = numSkipArrayKeys;
+
+ /*
+ * Now add skip array for previous scan key's attribute, though
+ * only if the attribute has no equality strategy scan keys.
+ *
+ * Adding skip arrays to an attribute that has one or more
+ * inequality scan keys will cause preprocessing to output a range
+ * skip array. This will happen when preprocessing proper deals
+ * with the redundancy between the array and its inequalities.
+ */
+ skipatts[attno_skip - 1].eq_op = InvalidOid;
+ if (!attno_has_equal)
+ {
+ /* Only saw inequalities for the prior attribute */
+ if (_bt_skip_support(rel, attno_skip,
+ &skipatts[attno_skip - 1]))
+ {
+ /* add a range skip array for this attribute */
+ numSkipArrayKeys++;
+ }
+ else
+ break;
+ }
+ else
+ {
+ /*
+ * Saw an equality for the prior attribute, so it doesn't need
+ * a skip array (not even a range skip array). We'll be able
+ * to add later skip arrays, too (doesn't matter if the prior
+ * attribute uses an input opclass without skip support).
+ */
+ }
+
+ /* Set things up for this new attribute */
+ attno_skip++;
+ attno_inputsk = inputsk->sk_attno;
+ attno_has_equal = false;
+ }
+
+ /*
+ * Track if this scan key's attribute has any equality strategy scan
+ * keys.
+ *
+ * Treat IS NULL scan keys as using equal strategy (they'll be marked
+ * as using it later on, by _bt_fix_scankey_strategy).
+ */
+ if (inputsk->sk_strategy == BTEqualStrategyNumber ||
+ (inputsk->sk_flags & SK_SEARCHNULL))
+ attno_has_equal = true;
+
+ /*
+ * We don't support RowCompare transformation. Remember that we saw a
+ * RowCompare, so that we don't keep adding skip attributes.
+ *
+ * We do still backfill skip attributes before the RowCompare, so that
+ * it can be marked required. This is similar to what happens when a
+ * conventional inequality uses an opclass that lacks skip support.
+ */
+ if (inputsk->sk_flags & SK_ROW_HEADER)
+ attno_has_rowcompare = true;
+ }
+
+ return numSkipArrayKeys;
+}
+
+/*
+ * _bt_skip_support() -- set up skip support function in *skipatts
+ *
+ * Returns true on success, indicating that we set *skipatts with input
+ * opclass's skip support routine for caller. Otherwise returns false.
+ */
+static bool
+_bt_skip_support(Relation rel, int add_skip_attno, BTSkipPreproc *skipatts)
+{
+ int16 *indoption = rel->rd_indoption;
+ Oid opfamily = rel->rd_opfamily[add_skip_attno - 1];
+ Oid opcintype = rel->rd_opcintype[add_skip_attno - 1];
+ bool reverse;
+
+ /* Look up input opclass's equality operator */
+ skipatts->eq_op = get_opfamily_member(opfamily, opcintype, opcintype,
+ BTEqualStrategyNumber);
+
+ /*
+ * We don't expect input opclasses lacking even an equality operator, but
+ * it's possible. Deal with it gracefully.
+ */
+ if (!OidIsValid(skipatts->eq_op))
+ return false;
+
+ /* Have skip support infrastructure set all SkipSupport fields */
+ reverse = (indoption[add_skip_attno - 1] & INDOPTION_DESC) != 0;
+ return PrepareSkipSupportFromOpclass(opfamily, opcintype, reverse,
+ &skipatts->sksup);
+}
+
+/*
+ * _bt_apply_decrement() -- Get a decremented copy of skey's arg
+ *
+ * Note: this wrapper function calls the opclass increment function when the
+ * index stores values in descending order. We're "logically decrementing" to
+ * the previous value in the key space regardless.
+ */
+static inline Datum
+_bt_apply_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
+{
+ if (!(skey->sk_flags & SK_BT_DESC))
+ return array->sksup.decrement(rel, skey->sk_argument);
+ else
+ return array->sksup.increment(rel, skey->sk_argument);
+}
+
+/*
+ * _bt_apply_increment() -- Get an incremented copy of skey's arg
+ *
+ * Note: this wrapper function calls the opclass decrement function when the
+ * index stores values in descending order. We're "logically incrementing" to
+ * the next value in the key space regardless.
+ */
+static inline Datum
+_bt_apply_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
+{
+ if (!(skey->sk_flags & SK_BT_DESC))
+ return array->sksup.increment(rel, skey->sk_argument);
+ else
+ return array->sksup.decrement(rel, skey->sk_argument);
+}
+
/*
* _bt_setup_array_cmp() -- Set up array comparison functions
*
@@ -979,15 +1384,10 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey
{
Relation rel = scan->indexRelation;
Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1];
- int cmpresult = 0,
- cmpexact = 0,
- matchelem,
- new_nelems = 0;
FmgrInfo crosstypeproc;
FmgrInfo *orderprocp = orderproc;
Assert(arraysk->sk_attno == skey->sk_attno);
- Assert(array->num_elems > 0);
Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER)));
Assert((arraysk->sk_flags & SK_SEARCHARRAY) &&
arraysk->sk_strategy == BTEqualStrategyNumber);
@@ -1000,8 +1400,8 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey
* datum of opclass input type for the index's attribute (on-disk type).
* We can reuse the array's ORDER proc whenever the non-array scan key's
* type is a match for the corresponding attribute's input opclass type.
- * Otherwise, we have to do another ORDER proc lookup so that our call to
- * _bt_binsrch_array_skey applies the correct comparator.
+ * Otherwise, we have to do another ORDER proc lookup. We have to be sure
+ * that _bt_compare_array_skey/_bt_binsrch_array_skey use the right proc.
*
* Note: we have to support the convention that sk_subtype == InvalidOid
* means the opclass input type; this is a hack to simplify life for
@@ -1032,11 +1432,46 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey
return false;
}
- /* We have all we need to determine redundancy/contradictoriness */
orderprocp = &crosstypeproc;
fmgr_info(cmp_proc, orderprocp);
}
+ /*
+ * We have all we need to determine redundancy/contradictoriness.
+ *
+ * Perform preprocessing of the array based on whether it's a conventional
+ * array, or a skip array. Sets *qual_ok correctly in passing.
+ */
+ if (array->num_elems != -1)
+ _bt_apply_compare_array(arraysk, skey,
+ orderprocp, array, qual_ok);
+ else
+ _bt_apply_compare_skiparray(scan, arraysk, skey,
+ orderproc, orderprocp,
+ array, qual_ok);
+
+ return true;
+}
+
+/*
+ * Finish off preprocessing of conventional (non-skip) array scan key when it
+ * is redundant with (or contradicted by) a non-array scalar scan key.
+ *
+ * _bt_compare_array_scankey_args helper function, called after the relevant
+ * (potentially cross-type) ORDER proc has been looked up successfully.
+ */
+static void
+_bt_apply_compare_array(ScanKey arraysk, ScanKey skey, FmgrInfo *orderprocp,
+ BTArrayKeyInfo *array, bool *qual_ok)
+{
+ int cmpresult = 0,
+ cmpexact = 0,
+ matchelem,
+ new_nelems = 0;
+
+ Assert(array->num_elems > 0);
+ Assert(!(arraysk->sk_flags & SK_BT_SKIP));
+
matchelem = _bt_binsrch_array_skey(orderprocp, false,
NoMovementScanDirection,
skey->sk_argument, false, array,
@@ -1088,8 +1523,152 @@ _bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey
array->num_elems = new_nelems;
*qual_ok = new_nelems > 0;
+}
- return true;
+/*
+ * Finish off preprocessing of skip array scan key when it is redundant with
+ * (or contradicted by) a non-array scalar scan key.
+ *
+ * _bt_compare_array_scankey_args helper function, called after the relevant
+ * (potentially cross-type) ORDER proc has been looked up successfully.
+ *
+ * Arrays used to skip (skip scan/missing key attribute predicates) work by
+ * procedurally generating their elements on the fly. We must still
+ * "eliminate contradictory elements", but it works a little differently: we
+ * narrow the range of the skip array, such that the array will never
+ * generated contradicted-by-skey elements.
+ *
+ * FIXME Our behavior in scenarios with cross-type operators (range skip scan
+ * cases) is buggy. We're naively copying datums of a different type from
+ * scalar inequality scan keys into the array's low_value and high_value
+ * fields. In practice this tends to not visibly break (in practice types
+ * that appear within the same operator family tend to have compatible datum
+ * representations, at least on systems with little-endian byte order). Put
+ * off dealing with the problem until a later revision of the patch.
+ *
+ * It seems likely that the best way to fix this problem will involve keeping
+ * around the original operator in the BTArrayKeyInfo array struct whenever
+ * we're passed a "redundant" cross-type inequality operator (an approach
+ * involving casts/coercions might be tempting, but seems much too fragile).
+ * We only need to use not-column-input-opclass-type operators for the first
+ * and/or last array elements from the skip array under this scheme; we'll
+ * still mostly be dealing with opcintype-typed datums, copied from the index
+ * (as well as incrementing/decrementing copies of those index tuple datums).
+ * Importantly, this scheme should work just as well with an opfamily that
+ * doesn't even have an orderprocp cross-type ORDER operator to pass us here
+ * (we might even have to keep more than one same-strategy inequality, since
+ * in general _bt_preprocess_keys might not be able to prove which inequality
+ * is redundant).
+ */
+static void
+_bt_apply_compare_skiparray(IndexScanDesc scan, ScanKey arraysk, ScanKey skey,
+ FmgrInfo *orderproc, FmgrInfo *orderprocp,
+ BTArrayKeyInfo *array, bool *qual_ok)
+{
+ Relation rel = scan->indexRelation;
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Form_pg_attribute attr = TupleDescAttr(RelationGetDescr(rel),
+ skey->sk_attno - 1);
+ MemoryContext oldContext;
+ int cmpresult;
+
+ /*
+ * We don't expect to have to deal with NULLs in non-array/non-skip scan
+ * key. We expect _bt_preprocess_array_keys to avoid generating a skip
+ * array for an index attribute with an IS NULL input scan key. It will
+ * still do so in the presence of IS NOT NULL input scan keys, but
+ * _bt_compare_scankey_args is expected to handle those for us.
+ */
+ Assert(arraysk->sk_flags & SK_BT_SKIP);
+ Assert(arraysk->sk_flags & SK_SEARCHARRAY);
+ Assert(!(skey->sk_flags & SK_ISNULL));
+ Assert(array->num_elems == -1);
+
+ /*
+ * Scalar scan key must be a B-Tree operator, which must always be strict.
+ * Array shouldn't generate a NULL "array element"/an IS NULL qual. This
+ * isn't just an optimization; it's strictly necessary for correctness.
+ */
+ array->null_elem = false;
+
+ switch (skey->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+
+ /*
+ * detect if scan key argument will be < low_value once
+ * decremented
+ */
+ cmpresult = _bt_compare_array_skey(orderprocp,
+ skey->sk_argument, false,
+ array->sksup.low_elem, false,
+ arraysk);
+ if (cmpresult <= 0)
+ {
+ /* decrementing would make qual unsatisfiable, so don't try */
+ *qual_ok = false;
+ return;
+ }
+
+ /* decremented scan key value becomes skip array's new high_value */
+ oldContext = MemoryContextSwitchTo(so->arrayContext);
+ array->sksup.high_elem = _bt_apply_decrement(rel, skey, array);
+ MemoryContextSwitchTo(oldContext);
+ break;
+ case BTLessEqualStrategyNumber:
+ oldContext = MemoryContextSwitchTo(so->arrayContext);
+ array->sksup.high_elem = datumCopy(skey->sk_argument,
+ attr->attbyval, attr->attlen);
+ MemoryContextSwitchTo(oldContext);
+ break;
+ case BTEqualStrategyNumber:
+ /* _bt_preprocess_array_keys should have avoided this */
+ elog(ERROR, "equality strategy scan key conflicts with skip key for attribute %d on index \"%s\"",
+ skey->sk_attno, RelationGetRelationName(rel));
+ break;
+ case BTGreaterEqualStrategyNumber:
+ oldContext = MemoryContextSwitchTo(so->arrayContext);
+ array->sksup.low_elem = datumCopy(skey->sk_argument,
+ attr->attbyval, attr->attlen);
+ MemoryContextSwitchTo(oldContext);
+ break;
+ case BTGreaterStrategyNumber:
+
+ /*
+ * detect if scan key argument will be > high_value once
+ * incremented
+ */
+ cmpresult = _bt_compare_array_skey(orderprocp,
+ skey->sk_argument, false,
+ array->sksup.high_elem, false,
+ arraysk);
+ if (cmpresult >= 0)
+ {
+ /* incrementing would make qual unsatisfiable, so don't try */
+ *qual_ok = false;
+ return;
+ }
+
+ /* incremented scan key value becomes skip array's new low_value */
+ oldContext = MemoryContextSwitchTo(so->arrayContext);
+ array->sksup.low_elem = _bt_apply_increment(rel, skey, array);
+ MemoryContextSwitchTo(oldContext);
+ break;
+ default:
+ elog(ERROR, "unrecognized StrategyNumber: %d",
+ (int) skey->sk_strategy);
+ break;
+ }
+
+ /*
+ * Is the qual contradictory, or is it merely "redundant" with consed-up
+ * skip array?
+ */
+ cmpresult = _bt_compare_array_skey(orderproc, /* don't use orderprocp */
+ array->sksup.low_elem, false,
+ array->sksup.high_elem, false,
+ arraysk);
+ *qual_ok = (cmpresult <= 0);
}
/*
@@ -1130,7 +1709,8 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg)
static inline int32
_bt_compare_array_skey(FmgrInfo *orderproc,
Datum tupdatum, bool tupnull,
- Datum arrdatum, ScanKey cur)
+ Datum arrdatum, bool arrnull,
+ ScanKey cur)
{
int32 result = 0;
@@ -1138,14 +1718,14 @@ _bt_compare_array_skey(FmgrInfo *orderproc,
if (tupnull) /* NULL tupdatum */
{
- if (cur->sk_flags & SK_ISNULL)
+ if (arrnull)
result = 0; /* NULL "=" NULL */
else if (cur->sk_flags & SK_BT_NULLS_FIRST)
result = -1; /* NULL "<" NOT_NULL */
else
result = 1; /* NULL ">" NOT_NULL */
}
- else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */
+ else if (arrnull) /* NOT_NULL tupdatum, NULL arrdatum */
{
if (cur->sk_flags & SK_BT_NULLS_FIRST)
result = 1; /* NOT_NULL ">" NULL */
@@ -1211,6 +1791,8 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc,
Datum arrdatum;
Assert(cur->sk_flags & SK_SEARCHARRAY);
+ Assert(!(cur->sk_flags & SK_BT_SKIP));
+ Assert(!(cur->sk_flags & SK_ISNULL)); /* plain arrays can't do this */
Assert(cur->sk_strategy == BTEqualStrategyNumber);
if (cur_elem_trig)
@@ -1246,7 +1828,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc,
{
arrdatum = array->elem_values[low_elem];
result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
- arrdatum, cur);
+ arrdatum, false, cur);
if (result <= 0)
{
@@ -1274,7 +1856,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc,
{
arrdatum = array->elem_values[high_elem];
result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
- arrdatum, cur);
+ arrdatum, false, cur);
if (result >= 0)
{
@@ -1301,7 +1883,7 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc,
arrdatum = array->elem_values[mid_elem];
result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
- arrdatum, cur);
+ arrdatum, false, cur);
if (result == 0)
{
@@ -1326,13 +1908,123 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc,
*/
if (low_elem != mid_elem)
result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
- array->elem_values[low_elem], cur);
+ array->elem_values[low_elem], false,
+ cur);
*set_elem_result = result;
return low_elem;
}
+/*
+ * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array
+ *
+ * Skip scan arrays procedurally generate their elements on-demand. They
+ * largely function in the same way as standard arrays. They can be rolled
+ * over by standard arrays (standard array can also roll over skip arrays).
+ *
+ * This routine doesn't return an index into the array, because the array
+ * doesn't actually have any elements (it has low_value and high_value, which
+ * indicate the range of values that the array can generate). Note that this
+ * may include a NULL value/an IS NULL qual (unlike with true arrays).
+ *
+ * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true
+ * array. The value 0 indicates that tupdatum/tupnull is within the range of
+ * the skip array. Other values indicate what _bt_compare_array_skey returned
+ * for the best available match to tupdatum/tupnull (in practice this means
+ * either the lowest item or the highest item in the range of the array).
+ *
+ * cur_elem_trig indicates if array advancement was triggered by this skip
+ * array's scan key. We can apply this information to find the next matching
+ * array element in the current scan direction using fewer comparisons.
+ */
+static void
+_bt_binsrch_skiparray_skey(FmgrInfo *orderproc,
+ bool cur_elem_trig, ScanDirection dir,
+ Datum tupdatum, bool tupnull,
+ BTArrayKeyInfo *array, ScanKey cur,
+ int32 *set_elem_result)
+{
+ Datum arrdatum;
+ bool arrnull;
+
+ Assert(!ScanDirectionIsNoMovement(dir));
+ Assert(cur->sk_flags & SK_BT_SKIP);
+ Assert(cur->sk_flags & SK_SEARCHARRAY);
+ Assert(cur->sk_flags & SK_BT_REQFWD);
+ Assert(array->num_elems == -1);
+
+ /*
+ * Compare tupdatum against "first array element" in the current scan
+ * direction first (and allow NULL to be treated as a possible element).
+ *
+ * Optimization: don't have to bother with this when passed a skip array
+ * that is known to have triggered array advancement.
+ */
+ if (!cur_elem_trig)
+ {
+ if (ScanDirectionIsForward(dir))
+ {
+ arrdatum = array->sksup.low_elem;
+ arrnull = array->null_elem && (cur->sk_flags & SK_BT_NULLS_FIRST);
+ }
+ else
+ {
+ arrdatum = array->sksup.high_elem;
+ arrnull = array->null_elem && !(cur->sk_flags & SK_BT_NULLS_FIRST);
+ }
+
+ *set_elem_result = _bt_compare_array_skey(orderproc,
+ tupdatum, tupnull,
+ arrdatum, arrnull,
+ cur);
+
+ /*
+ * Optimization: return early when >= lower bound happens to be an
+ * exact match (or when <= upper bound is an exact match during a
+ * backwards scan)
+ */
+ if (*set_elem_result == 0)
+ return;
+
+ /* Is tupdatum "before the start" of our lowest "element"? */
+ if ((ScanDirectionIsForward(dir) && *set_elem_result < 0) ||
+ (ScanDirectionIsBackward(dir) && *set_elem_result > 0))
+ return;
+ }
+
+ /*
+ * Now compare tupdatum to the "last array element" in the current scan
+ * direction (and allow NULL to be treated as a possible element)
+ */
+ if (ScanDirectionIsForward(dir))
+ {
+ arrdatum = array->sksup.high_elem;
+ arrnull = array->null_elem && !(cur->sk_flags & SK_BT_NULLS_FIRST);
+ }
+ else
+ {
+ arrdatum = array->sksup.low_elem;
+ arrnull = array->null_elem && (cur->sk_flags & SK_BT_NULLS_FIRST);
+ }
+
+ *set_elem_result = _bt_compare_array_skey(orderproc,
+ tupdatum, tupnull,
+ arrdatum, arrnull,
+ cur);
+
+ /* Is tupdatum "after the end" of our highest "element"? */
+ if ((ScanDirectionIsForward(dir) && *set_elem_result > 0) ||
+ (ScanDirectionIsBackward(dir) && *set_elem_result < 0))
+ return;
+
+ /*
+ * tupdatum must be within the range of the skip array. Have our caller
+ * treat tupdatum as one of the array's elements.
+ */
+ *set_elem_result = 0;
+}
+
/*
* _bt_start_array_keys() -- Initialize array keys at start of a scan
*
@@ -1342,29 +2034,257 @@ _bt_binsrch_array_skey(FmgrInfo *orderproc,
void
_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- int i;
Assert(so->numArrayKeys);
Assert(so->qual_ok);
- for (i = 0; i < so->numArrayKeys; i++)
+ for (int i = 0; i < so->numArrayKeys; i++)
{
BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
ScanKey skey = &so->keyData[curArrayKey->scan_key];
- Assert(curArrayKey->num_elems > 0);
Assert(skey->sk_flags & SK_SEARCHARRAY);
- if (ScanDirectionIsBackward(dir))
- curArrayKey->cur_elem = curArrayKey->num_elems - 1;
- else
- curArrayKey->cur_elem = 0;
- skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
+ _bt_scankey_set_low_or_high(rel, skey, curArrayKey,
+ ScanDirectionIsForward(dir));
}
so->scanBehind = false;
}
+/*
+ * _bt_scankey_decrement() -- decrement scan key's sk_argument
+ *
+ * Unsets scan key "IS NULL" flags when required, and handles memory
+ * management for pass-by-reference types.
+ */
+static void
+_bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
+{
+ Assert(skey->sk_flags & SK_BT_SKIP);
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+
+ if (skey->sk_flags & SK_ISNULL)
+ _bt_scankey_unset_isnull(rel, skey, array);
+ else
+ {
+ Datum dec_sk_argument;
+ Form_pg_attribute attr;
+
+ /* Get a decremented copy of existing sk_argument */
+ dec_sk_argument = _bt_apply_decrement(rel, skey, array);
+
+ /* Free memory previously allocated for sk_argument if needed */
+ attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1);
+ if (!attr->attbyval && skey->sk_argument)
+ pfree(DatumGetPointer(skey->sk_argument));
+
+ /* Set decremented copy of original sk_argument in scan key */
+ skey->sk_argument = dec_sk_argument;
+ }
+}
+
+/*
+ * _bt_scankey_increment() -- increment scan key's sk_argument
+ *
+ * Unsets scan key "IS NULL" flags when required, and handles memory
+ * management for pass-by-reference types.
+ */
+static void
+_bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
+{
+ Assert(skey->sk_flags & SK_BT_SKIP);
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+
+ if (skey->sk_flags & SK_ISNULL)
+ _bt_scankey_unset_isnull(rel, skey, array);
+ else
+ {
+ Datum inc_sk_argument;
+ Form_pg_attribute attr;
+
+ /* Get an incremented copy of existing sk_argument */
+ inc_sk_argument = _bt_apply_increment(rel, skey, array);
+
+ /* Free memory previously allocated for sk_argument if needed */
+ attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1);
+ if (!attr->attbyval && skey->sk_argument)
+ pfree(DatumGetPointer(skey->sk_argument));
+
+ /* Set incremented copy of original sk_argument in scan key */
+ skey->sk_argument = inc_sk_argument;
+ }
+}
+
+/*
+ * _bt_scankey_set_low_or_high() -- Set array scan key to lowest/highest element
+ *
+ * Caller also passes associated scan key, which will have its argument set to
+ * the lowest/highest array value in passing.
+ */
+static void
+_bt_scankey_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array,
+ bool low_not_high)
+{
+ Form_pg_attribute attr;
+
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+
+ if (array->num_elems != -1)
+ {
+ /* set low or high element for conventional array */
+ int set_elem = 0;
+
+ Assert(!(skey->sk_flags & SK_BT_SKIP));
+
+ if (!low_not_high)
+ set_elem = array->num_elems - 1;
+
+ /*
+ * Just copy over array datum (only skip arrays require freeing and
+ * allocating memory for sk_argument)
+ */
+ array->cur_elem = set_elem;
+ skey->sk_argument = array->elem_values[set_elem];
+
+ return;
+ }
+
+ /* set low or high element for skip array */
+ Assert(skey->sk_flags & SK_BT_SKIP);
+ Assert(array->num_elems == -1);
+
+ /* Free memory previously allocated for sk_argument if needed */
+ attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1);
+ if (!attr->attbyval && skey->sk_argument)
+ pfree(DatumGetPointer(skey->sk_argument));
+
+ if (array->null_elem &&
+ (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0)))
+ {
+ /* Set element to NULL (lowest/highest element) */
+ skey->sk_argument = (Datum) 0;
+ skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL);
+ }
+ else
+ {
+ /* Lowest array element isn't NULL */
+ if (low_not_high)
+ skey->sk_argument = datumCopy(array->sksup.low_elem,
+ attr->attbyval, attr->attlen);
+ else
+ skey->sk_argument = datumCopy(array->sksup.high_elem,
+ attr->attbyval, attr->attlen);
+
+ skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL);
+ }
+}
+
+/*
+ * _bt_scankey_set_element() -- Set skip array scan key's sk_argument
+ *
+ * Sets scan key to "IS NULL" when required, and handles memory management for
+ * pass-by-reference types.
+ */
+static void
+_bt_scankey_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array,
+ Datum tupdatum, bool tupnull)
+{
+ /* tupdatum within the range of low_value/high_value */
+ Form_pg_attribute attr;
+
+ Assert(skey->sk_flags & SK_BT_SKIP);
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+
+ /* Free memory previously allocated for sk_argument if needed */
+ attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1);
+ if (!attr->attbyval && skey->sk_argument)
+ pfree(DatumGetPointer(skey->sk_argument));
+
+ /*
+ * Treat tupdatum/tupnull as a matching array element.
+ *
+ * We just copy tupdatum into the array's scan key (there is no
+ * conventional array element for us to set, of course).
+ */
+ if (tupnull)
+ {
+ /*
+ * Unlike standard arrays, skip arrays sometimes need to locate NULLs.
+ * We can treat them as just another value from the domain of indexed
+ * values.
+ */
+ Assert(array->null_elem);
+ Assert(!(skey->sk_flags & (SK_SEARCHNULL | SK_ISNULL)));
+
+ skey->sk_argument = (Datum) 0;
+ skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL);
+ }
+ else
+ {
+ skey->sk_argument = datumCopy(tupdatum,
+ attr->attbyval, attr->attlen);
+ skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL);
+ }
+}
+
+/*
+ * _bt_scankey_unset_isnull() -- increment/decrement scan key from NULL
+ *
+ * Unsets scan key's "IS NULL" marking, and sets the non-NULL value from the
+ * array immediately before (or immediate after) NULL in the key space.
+ */
+static void
+_bt_scankey_unset_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
+{
+ Form_pg_attribute attr;
+
+ Assert(skey->sk_flags & SK_BT_SKIP);
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+ Assert(skey->sk_flags & SK_ISNULL);
+ Assert(array->null_elem);
+
+ /*
+ * sk_argument must be set to whatever non-NULL value comes immediately
+ * before or after NULL
+ */
+ attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1);
+ skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL);
+ if (skey->sk_flags & SK_BT_NULLS_FIRST)
+ skey->sk_argument = datumCopy(array->sksup.low_elem,
+ attr->attbyval, attr->attlen);
+ else
+ skey->sk_argument = datumCopy(array->sksup.high_elem,
+ attr->attbyval, attr->attlen);
+}
+
+/*
+ * _bt_scankey_set_isnull() -- increment/decrement scan key to NULL
+ *
+ * Sets scan key to "IS NULL", and handles memory management for
+ * pass-by-reference types.
+ */
+static void
+_bt_scankey_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
+{
+ Form_pg_attribute attr;
+
+ Assert(skey->sk_flags & SK_BT_SKIP);
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+ Assert(!(skey->sk_flags & SK_ISNULL));
+ Assert(array->null_elem);
+
+ /* Free memory previously allocated for sk_argument if needed */
+ attr = TupleDescAttr(RelationGetDescr(rel), skey->sk_attno - 1);
+ if (!attr->attbyval && skey->sk_argument)
+ pfree(DatumGetPointer(skey->sk_argument));
+
+ /* Set sk_argument to NULL */
+ skey->sk_argument = (Datum) 0;
+ skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL);
+}
+
/*
* _bt_advance_array_keys_increment() -- Advance to next set of array elements
*
@@ -1380,6 +2300,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
static bool
_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
/*
@@ -1391,10 +2312,24 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
{
BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
ScanKey skey = &so->keyData[curArrayKey->scan_key];
+ FmgrInfo *orderproc = &so->orderProcs[curArrayKey->scan_key];
int cur_elem = curArrayKey->cur_elem;
int num_elems = curArrayKey->num_elems;
bool rolled = false;
+ /* Handle incrementing a skip array */
+ if (num_elems == -1)
+ {
+ /* Attempt to incrementally advance this skip scan array */
+ if (_bt_advance_skip_array_key_increment(rel, dir, curArrayKey,
+ skey, orderproc))
+ return true;
+
+ /* Array rolled over. Need to advance next array key, if any. */
+ continue;
+ }
+
+ /* Handle incrementing a true array */
if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems)
{
cur_elem = 0;
@@ -1411,7 +2346,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
if (!rolled)
return true;
- /* Need to advance next array key, if any */
+ /* Array rolled over. Need to advance next array key, if any. */
}
/*
@@ -1429,6 +2364,95 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
return false;
}
+/*
+ * _bt_advance_skip_array_key_increment() -- increment a skip scan array
+ *
+ * Returns true when the skip array was successfully incremented to the next
+ * value in the current scan direction, dir. Otherwise handles roll over by
+ * setting array to its final element for the current scan direction.
+ */
+static bool
+_bt_advance_skip_array_key_increment(Relation rel, ScanDirection dir,
+ BTArrayKeyInfo *array, ScanKey skey,
+ FmgrInfo *orderproc)
+{
+ Datum sk_argument = skey->sk_argument;
+ bool sk_isnull = (skey->sk_flags & SK_ISNULL) != 0;
+ int compare;
+
+ Assert(skey->sk_flags & SK_BT_SKIP);
+ Assert(skey->sk_flags & SK_SEARCHARRAY);
+ Assert(array->num_elems == -1);
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* high_elem is final non-NULL element in current scan direction */
+ compare = _bt_compare_array_skey(orderproc,
+ array->sksup.high_elem, false,
+ sk_argument, sk_isnull,
+ skey);
+ if (compare > 0)
+ {
+ /* Increment non-NULL element to next non-NULL element */
+ _bt_scankey_increment(rel, skey, array);
+
+ return true;
+ }
+ else if (compare == 0 && array->null_elem &&
+ !(skey->sk_flags & SK_BT_NULLS_FIRST))
+ {
+ /*
+ * Existing sk_argument was already equal to high_elem. Increment
+ * from high_elem to final NULL element (without calling opclass
+ * support function, which doesn't know how to handle NULLs).
+ */
+ _bt_scankey_set_isnull(rel, skey, array);
+
+ return true;
+ }
+
+ /* Exhausted all array elements in current scan direction */
+ }
+ else
+ {
+ /* low_elem is final non-NULL element in current scan direction */
+ compare = _bt_compare_array_skey(orderproc,
+ array->sksup.low_elem, false,
+ sk_argument, sk_isnull,
+ skey);
+ if (compare < 0)
+ {
+ /* Decrement non-NULL element to previous non-NULL element */
+ _bt_scankey_decrement(rel, skey, array);
+
+ return true;
+ }
+ else if (compare == 0 && array->null_elem &&
+ (skey->sk_flags & SK_BT_NULLS_FIRST))
+ {
+ /*
+ * Existing sk_argument was already equal to low_elem. Decrement
+ * from low_elem to final NULL element (without calling opclass
+ * support function, which doesn't know how to handle NULLs).
+ */
+ _bt_scankey_set_isnull(rel, skey, array);
+
+ return true;
+ }
+
+ /* Exhausted all array elements in current scan direction */
+ }
+
+ /*
+ * Skip array rolls over. Start over at the array's lowest sorting value
+ * (or its highest value, for backward scans).
+ */
+ _bt_scankey_set_low_or_high(rel, skey, array, ScanDirectionIsForward(dir));
+
+ /* Caller must consider earlier/more significant arrays in turn */
+ return false;
+}
+
/*
* _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays
*
@@ -1466,6 +2490,7 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir)
static void
_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
int arrayidx = 0;
@@ -1473,7 +2498,6 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
{
ScanKey cur = so->keyData + ikey;
BTArrayKeyInfo *array = NULL;
- int first_elem_dir;
if (!(cur->sk_flags & SK_SEARCHARRAY) ||
cur->sk_strategy != BTEqualStrategyNumber)
@@ -1485,16 +2509,10 @@ _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
continue;
- if (ScanDirectionIsForward(dir))
- first_elem_dir = 0;
- else
- first_elem_dir = array->num_elems - 1;
+ Assert(array->num_elems != -1); /* No skipping of non-required arrays */
- if (array->cur_elem != first_elem_dir)
- {
- array->cur_elem = first_elem_dir;
- cur->sk_argument = array->elem_values[first_elem_dir];
- }
+ _bt_scankey_set_low_or_high(rel, cur, array,
+ ScanDirectionIsForward(dir));
}
}
@@ -1558,6 +2576,8 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++)
{
ScanKey cur = so->keyData + ikey;
+ Datum sk_argument = cur->sk_argument;
+ bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0;
Datum tupdatum;
bool tupnull;
int32 result;
@@ -1621,7 +2641,8 @@ _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
result = _bt_compare_array_skey(&so->orderProcs[ikey],
tupdatum, tupnull,
- cur->sk_argument, cur);
+ sk_argument, sk_isnull,
+ cur);
/*
* Does this comparison indicate that caller must _not_ advance the
@@ -1954,18 +2975,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
*/
if (beyond_end_advance)
{
- int final_elem_dir;
-
- if (ScanDirectionIsBackward(dir) || !array)
- final_elem_dir = 0;
- else
- final_elem_dir = array->num_elems - 1;
-
- if (array && array->cur_elem != final_elem_dir)
- {
- array->cur_elem = final_elem_dir;
- cur->sk_argument = array->elem_values[final_elem_dir];
- }
+ if (array)
+ _bt_scankey_set_low_or_high(rel, cur, array,
+ ScanDirectionIsBackward(dir));
continue;
}
@@ -1990,18 +3002,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
*/
if (!all_required_satisfied || cur->sk_attno > tupnatts)
{
- int first_elem_dir;
-
- if (ScanDirectionIsForward(dir) || !array)
- first_elem_dir = 0;
- else
- first_elem_dir = array->num_elems - 1;
-
- if (array && array->cur_elem != first_elem_dir)
- {
- array->cur_elem = first_elem_dir;
- cur->sk_argument = array->elem_values[first_elem_dir];
- }
+ if (array)
+ _bt_scankey_set_low_or_high(rel, cur, array,
+ ScanDirectionIsForward(dir));
continue;
}
@@ -2019,15 +3022,27 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
/*
* Binary search for closest match that's available from the array
*/
- set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey],
- cur_elem_trig, dir,
- tupdatum, tupnull, array, cur,
- &result);
+ if (array->num_elems != -1)
+ set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey],
+ cur_elem_trig, dir,
+ tupdatum, tupnull, array, cur,
+ &result);
- Assert(set_elem >= 0 && set_elem < array->num_elems);
+ /*
+ * Skip array. "Binary search" by checking if tupdatum/tupnull
+ * are within the low_value/high_value range of the skip array.
+ */
+ else
+ _bt_binsrch_skiparray_skey(&so->orderProcs[ikey],
+ cur_elem_trig, dir,
+ tupdatum, tupnull, array, cur,
+ &result);
}
else
{
+ Datum sk_argument = cur->sk_argument;
+ bool sk_isnull = (cur->sk_flags & SK_ISNULL) != 0;
+
Assert(sktrig_required && required);
/*
@@ -2041,7 +3056,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
*/
result = _bt_compare_array_skey(&so->orderProcs[ikey],
tupdatum, tupnull,
- cur->sk_argument, cur);
+ sk_argument, sk_isnull, cur);
}
/*
@@ -2061,6 +3076,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
* its final element is. Once outside the loop we'll then "increment
* this array's set_elem" by calling _bt_advance_array_keys_increment.
* That way the process rolls over to higher order arrays as needed.
+ * The skip array case will set the array's scan key to the final
+ * valid element for the current scan direction, which is equivalent
+ * (when we have a real set_elem "match" it's just the final element
+ * in the current scan direction).
*
* Under this scheme any required arrays only ever ratchet forwards
* (or backwards), and always do so to the maximum possible extent
@@ -2100,11 +3119,62 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
}
}
- /* Advance array keys, even when set_elem isn't an exact match */
- if (array && array->cur_elem != set_elem)
+ /* Advance array keys, even when we don't have an exact match */
+
+ if (!array)
+ continue; /* no element to set in non-array */
+
+ /* Conventional arrays have a valid set_elem for us to advance to */
+ if (array->num_elems != -1)
{
- array->cur_elem = set_elem;
- cur->sk_argument = array->elem_values[set_elem];
+ if (array->cur_elem != set_elem)
+ {
+ array->cur_elem = set_elem;
+ cur->sk_argument = array->elem_values[set_elem];
+ }
+
+ continue;
+ }
+
+ /*
+ * Conceptually, skip arrays also have array elements. The actual
+ * elements/values are generated procedurally and on demand.
+ */
+ Assert(cur->sk_flags & SK_BT_SKIP);
+ Assert(array->num_elems == -1);
+ Assert(required);
+
+ if (result == 0)
+ {
+ /*
+ * Anything within the range of possible element values is treated
+ * as "a match for one of the array's elements". Store the next
+ * scan key argument value by taking a copy of the tupdatum value
+ * from caller's tuple (or set scan key IS NULL when tupnull, iff
+ * the array's range of possible elements covers NULL).
+ */
+ _bt_scankey_set_element(rel, cur, array, tupdatum, tupnull);
+ }
+ else if (beyond_end_advance)
+ {
+ /*
+ * We need to set the array element to the final "element" in the
+ * current scan direction for "beyond end of array element" array
+ * advancement. See above for an explanation.
+ */
+ _bt_scankey_set_low_or_high(rel, cur, array,
+ ScanDirectionIsBackward(dir));
+ }
+ else
+ {
+ /*
+ * The closest matching element is the lowest element; even that
+ * still puts us ahead of caller's tuple in the key space. This
+ * process has to carry to any lower-order arrays. See above for
+ * an explanation.
+ */
+ _bt_scankey_set_low_or_high(rel, cur, array,
+ ScanDirectionIsForward(dir));
}
}
@@ -2460,10 +3530,12 @@ end_toplevel_scan:
/*
* _bt_preprocess_keys() -- Preprocess scan keys
*
+ * The first call here (per btrescan) allocates so->keyData[].
* The given search-type keys (taken from scan->keyData[])
* are copied to so->keyData[] with possible transformation.
* scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
- * the number of output keys (possibly less, never greater).
+ * the number of output keys. Calling here a second time (during the same
+ * btrescan) is a no-op.
*
* The output keys are marked with additional sk_flags bits beyond the
* system-standard bits supplied by the caller. The DESC and NULLS_FIRST
@@ -2483,6 +3555,8 @@ end_toplevel_scan:
* within each attribute may be done as a byproduct of the processing here.
* That process must leave array scan keys (within an attribute) in the same
* order as corresponding entries from the scan's BTArrayKeyInfo array info.
+ * We might also cons up skip array scan keys that weren't present in the
+ * original input keys; these are also output in standard attribute order.
*
* The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
* if they must be satisfied in order to continue the scan forward or backward
@@ -2550,9 +3624,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
int16 *indoption = scan->indexRelation->rd_indoption;
int new_numberOfKeys;
int numberOfEqualCols;
- ScanKey inkeys;
- ScanKey outkeys;
- ScanKey cur;
+ ScanKey inputsk;
BTScanKeyPreproc xform[BTMaxStrategyNumber];
bool test_result;
int i,
@@ -2584,7 +3656,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
return; /* done if qual-less scan */
/* If any keys are SK_SEARCHARRAY type, set up array-key info */
- arrayKeyData = _bt_preprocess_array_keys(scan);
+ arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys);
if (!so->qual_ok)
{
/* unmatchable array, so give up */
@@ -2598,32 +3670,36 @@ _bt_preprocess_keys(IndexScanDesc scan)
*/
if (arrayKeyData)
{
- inkeys = arrayKeyData;
+ inputsk = arrayKeyData;
/* Also maintain keyDataMap for remapping so->orderProc[] later */
keyDataMap = MemoryContextAlloc(so->arrayContext,
numberOfKeys * sizeof(int));
}
else
- inkeys = scan->keyData;
+ inputsk = scan->keyData;
+
+ /*
+ * Now that we have an estimate of the number of output scan keys
+ * (including any skip array scan keys), allocate space for them
+ */
+ so->keyData = palloc(sizeof(ScanKeyData) * numberOfKeys);
- outkeys = so->keyData;
- cur = &inkeys[0];
/* we check that input keys are correctly ordered */
- if (cur->sk_attno < 1)
+ if (inputsk->sk_attno < 1)
elog(ERROR, "btree index keys must be ordered by attribute");
/* We can short-circuit most of the work if there's just one key */
if (numberOfKeys == 1)
{
/* Apply indoption to scankey (might change sk_strategy!) */
- if (!_bt_fix_scankey_strategy(cur, indoption))
+ if (!_bt_fix_scankey_strategy(inputsk, indoption))
so->qual_ok = false;
- memcpy(outkeys, cur, sizeof(ScanKeyData));
+ memcpy(so->keyData, inputsk, sizeof(ScanKeyData));
so->numberOfKeys = 1;
/* We can mark the qual as required if it's for first index col */
- if (cur->sk_attno == 1)
- _bt_mark_scankey_required(outkeys);
+ if (inputsk->sk_attno == 1)
+ _bt_mark_scankey_required(so->keyData);
if (arrayKeyData)
{
/*
@@ -2631,8 +3707,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
* (we'll miss out on the single value array transformation, but
* that's not nearly as important when there's only one scan key)
*/
- Assert(cur->sk_flags & SK_SEARCHARRAY);
- Assert(cur->sk_strategy != BTEqualStrategyNumber ||
+ Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY);
+ Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber ||
(so->arrayKeys[0].scan_key == 0 &&
OidIsValid(so->orderProcs[0].fn_oid)));
}
@@ -2660,12 +3736,12 @@ _bt_preprocess_keys(IndexScanDesc scan)
* handle after-last-key processing. Actual exit from the loop is at the
* "break" statement below.
*/
- for (i = 0;; cur++, i++)
+ for (i = 0;; inputsk++, i++)
{
if (i < numberOfKeys)
{
/* Apply indoption to scankey (might change sk_strategy!) */
- if (!_bt_fix_scankey_strategy(cur, indoption))
+ if (!_bt_fix_scankey_strategy(inputsk, indoption))
{
/* NULL can't be matched, so give up */
so->qual_ok = false;
@@ -2677,12 +3753,12 @@ _bt_preprocess_keys(IndexScanDesc scan)
* If we are at the end of the keys for a particular attr, finish up
* processing and emit the cleaned-up keys.
*/
- if (i == numberOfKeys || cur->sk_attno != attno)
+ if (i == numberOfKeys || inputsk->sk_attno != attno)
{
int priorNumberOfEqualCols = numberOfEqualCols;
/* check input keys are correctly ordered */
- if (i < numberOfKeys && cur->sk_attno < attno)
+ if (i < numberOfKeys && inputsk->sk_attno < attno)
elog(ERROR, "btree index keys must be ordered by attribute");
/*
@@ -2741,7 +3817,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
return;
}
/* else discard the redundant non-equality key */
- Assert(!array || array->num_elems > 0);
+ Assert(!array || array->num_elems > 0 ||
+ array->num_elems == -1);
xform[j].skey = NULL;
xform[j].ikey = -1;
}
@@ -2786,7 +3863,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
}
/*
- * Emit the cleaned-up keys into the outkeys[] array, and then
+ * Emit the cleaned-up keys into the so->keyData[] array, and then
* mark them if they are required. They are required (possibly
* only in one direction) if all attrs before this one had "=".
*/
@@ -2794,7 +3871,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
{
if (xform[j].skey)
{
- ScanKey outkey = &outkeys[new_numberOfKeys++];
+ ScanKey outkey = &so->keyData[new_numberOfKeys++];
memcpy(outkey, xform[j].skey, sizeof(ScanKeyData));
if (arrayKeyData)
@@ -2811,19 +3888,19 @@ _bt_preprocess_keys(IndexScanDesc scan)
break;
/* Re-initialize for new attno */
- attno = cur->sk_attno;
+ attno = inputsk->sk_attno;
memset(xform, 0, sizeof(xform));
}
/* check strategy this key's operator corresponds to */
- j = cur->sk_strategy - 1;
+ j = inputsk->sk_strategy - 1;
/* if row comparison, push it directly to the output array */
- if (cur->sk_flags & SK_ROW_HEADER)
+ if (inputsk->sk_flags & SK_ROW_HEADER)
{
- ScanKey outkey = &outkeys[new_numberOfKeys++];
+ ScanKey outkey = &so->keyData[new_numberOfKeys++];
- memcpy(outkey, cur, sizeof(ScanKeyData));
+ memcpy(outkey, inputsk, sizeof(ScanKeyData));
if (arrayKeyData)
keyDataMap[new_numberOfKeys - 1] = i;
if (numberOfEqualCols == attno - 1)
@@ -2837,19 +3914,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
continue;
}
- /*
- * Does this input scan key require further processing as an array?
- */
- if (cur->sk_strategy == InvalidStrategy)
- {
- /* _bt_preprocess_array_keys marked this array key redundant */
- Assert(arrayKeyData);
- Assert(cur->sk_flags & SK_SEARCHARRAY);
- continue;
- }
-
- if (cur->sk_strategy == BTEqualStrategyNumber &&
- (cur->sk_flags & SK_SEARCHARRAY))
+ if (inputsk->sk_strategy == BTEqualStrategyNumber &&
+ (inputsk->sk_flags & SK_SEARCHARRAY))
{
/* _bt_preprocess_array_keys kept this array key */
Assert(arrayKeyData);
@@ -2863,7 +3929,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
if (xform[j].skey == NULL)
{
/* nope, so this scan key wins by default (at least for now) */
- xform[j].skey = cur;
+ xform[j].skey = inputsk;
xform[j].ikey = i;
xform[j].arrayidx = arrayidx;
}
@@ -2881,7 +3947,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
/*
* Have to set up array keys
*/
- if ((cur->sk_flags & SK_SEARCHARRAY))
+ if ((inputsk->sk_flags & SK_SEARCHARRAY))
{
array = &so->arrayKeys[arrayidx - 1];
orderproc = so->orderProcs + i;
@@ -2909,13 +3975,15 @@ _bt_preprocess_keys(IndexScanDesc scan)
*/
}
- if (_bt_compare_scankey_args(scan, cur, cur, xform[j].skey,
- array, orderproc, &test_result))
+ if (_bt_compare_scankey_args(scan, inputsk, inputsk,
+ xform[j].skey, array, orderproc,
+ &test_result))
{
/* Have all we need to determine redundancy */
if (test_result)
{
- Assert(!array || array->num_elems > 0);
+ Assert(!array || array->num_elems > 0 ||
+ array->num_elems == -1);
/*
* New key is more restrictive, and so replaces old key...
@@ -2923,7 +3991,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
if (j != (BTEqualStrategyNumber - 1) ||
!(xform[j].skey->sk_flags & SK_SEARCHARRAY))
{
- xform[j].skey = cur;
+ xform[j].skey = inputsk;
xform[j].ikey = i;
xform[j].arrayidx = arrayidx;
}
@@ -2936,7 +4004,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
* scan key. _bt_compare_scankey_args expects us to
* always keep arrays (and discard non-arrays).
*/
- Assert(!(cur->sk_flags & SK_SEARCHARRAY));
+ Assert(!(inputsk->sk_flags & SK_SEARCHARRAY));
}
}
else if (j == (BTEqualStrategyNumber - 1))
@@ -2959,14 +4027,14 @@ _bt_preprocess_keys(IndexScanDesc scan)
* even with incomplete opfamilies. _bt_advance_array_keys
* depends on this.
*/
- ScanKey outkey = &outkeys[new_numberOfKeys++];
+ ScanKey outkey = &so->keyData[new_numberOfKeys++];
memcpy(outkey, xform[j].skey, sizeof(ScanKeyData));
if (arrayKeyData)
keyDataMap[new_numberOfKeys - 1] = xform[j].ikey;
if (numberOfEqualCols == attno - 1)
_bt_mark_scankey_required(outkey);
- xform[j].skey = cur;
+ xform[j].skey = inputsk;
xform[j].ikey = i;
xform[j].arrayidx = arrayidx;
}
@@ -3057,10 +4125,11 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
if (array->scan_key != ikey)
return false;
- if (array->num_elems <= 0)
+ if (array->num_elems == 0 || array->num_elems < -1)
return false;
- if (cur->sk_argument != array->elem_values[array->cur_elem])
+ if (array->num_elems != -1 &&
+ cur->sk_argument != array->elem_values[array->cur_elem])
return false;
if (last_sk_attno > cur->sk_attno)
return false;
@@ -3135,6 +4204,22 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
bool leftnull,
rightnull;
+ /* Handle skip array comparison with IS NOT NULL scan key */
+ if ((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)
+ {
+ /* Shouldn't generate skip array in presence of IS NULL key */
+ Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNULL));
+ Assert((leftarg->sk_flags | rightarg->sk_flags) & SK_SEARCHNOTNULL);
+
+ /* Don't allow skip array to generate IS NULL scan key/element */
+ Assert(array->num_elems == -1);
+ array->null_elem = false;
+
+ /* IS NOT NULL key (could be leftarg or rightarg) now redundant */
+ *result = true;
+ return true;
+ }
+
if (leftarg->sk_flags & SK_ISNULL)
{
Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
@@ -3208,6 +4293,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
{
/* Can't make the comparison */
*result = false; /* suppress compiler warnings */
+ Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP));
return false;
}
@@ -3380,13 +4466,6 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
return true;
}
- if (skey->sk_strategy == InvalidStrategy)
- {
- /* Already-eliminated array scan key; don't need to fix anything */
- Assert(skey->sk_flags & SK_SEARCHARRAY);
- return true;
- }
-
/* Adjust strategy for DESC, if we didn't already */
if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
diff --git a/src/backend/access/nbtree/nbtvalidate.c b/src/backend/access/nbtree/nbtvalidate.c
index e9d4cd60d..96d0d9185 100644
--- a/src/backend/access/nbtree/nbtvalidate.c
+++ b/src/backend/access/nbtree/nbtvalidate.c
@@ -114,6 +114,10 @@ btvalidate(Oid opclassoid)
case BTOPTIONS_PROC:
ok = check_amoptsproc_signature(procform->amproc);
break;
+ case BTSKIPSUPPORT_PROC:
+ ok = check_amproc_signature(procform->amproc, VOIDOID, true,
+ 1, 1, INTERNALOID);
+ break;
default:
ereport(INFO,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c
index b8b5c147c..a86dbf71b 100644
--- a/src/backend/commands/opclasscmds.c
+++ b/src/backend/commands/opclasscmds.c
@@ -1330,6 +1330,31 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("btree equal image functions must not be cross-type")));
}
+ else if (member->number == BTSKIPSUPPORT_PROC)
+ {
+ if (procform->pronargs != 1 ||
+ procform->proargtypes.values[0] != INTERNALOID)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("btree skip support functions must accept type \"internal\"")));
+ if (procform->prorettype != VOIDOID)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("btree skip support functions must return void")));
+
+ /*
+ * pg_amproc functions are indexed by (lefttype, righttype), but a
+ * skip support function doesn't make sense in cross-type
+ * scenarios. The same opclass opcintype OID is always used for
+ * lefttype and righttype. Providing a cross-type routine isn't
+ * sensible. Reject cross-type ALTER OPERATOR FAMILY ... ADD
+ * FUNCTION 6 statements here.
+ */
+ if (member->lefttype != member->righttype)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("btree skip support functions must not be cross-type")));
+ }
}
else if (amoid == HASH_AM_OID)
{
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index edb09d4e3..e945686c8 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -96,6 +96,7 @@ OBJS = \
rowtypes.o \
ruleutils.o \
selfuncs.o \
+ skipsupport.o \
tid.o \
timestamp.o \
trigfuncs.o \
diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c
index 9c854e0e5..ea3d0f4b5 100644
--- a/src/backend/utils/adt/date.c
+++ b/src/backend/utils/adt/date.c
@@ -34,6 +34,7 @@
#include "utils/date.h"
#include "utils/datetime.h"
#include "utils/numeric.h"
+#include "utils/skipsupport.h"
#include "utils/sortsupport.h"
/*
@@ -455,6 +456,39 @@ date_sortsupport(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+static Datum
+date_decrement(Relation rel, Datum existing)
+{
+ DateADT dexisting = DatumGetDateADT(existing);
+
+ Assert(dexisting > DATEVAL_NOBEGIN);
+
+ return DateADTGetDatum(dexisting - 1);
+}
+
+static Datum
+date_increment(Relation rel, Datum existing)
+{
+ DateADT dexisting = DatumGetDateADT(existing);
+
+ Assert(dexisting < DATEVAL_NOEND);
+
+ return DateADTGetDatum(dexisting + 1);
+}
+
+Datum
+date_skipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+
+ sksup->decrement = date_decrement;
+ sksup->increment = date_increment;
+ sksup->low_elem = DateADTGetDatum(DATEVAL_NOBEGIN);
+ sksup->high_elem = DateADTGetDatum(DATEVAL_NOEND);
+
+ PG_RETURN_VOID();
+}
+
Datum
date_finite(PG_FUNCTION_ARGS)
{
diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build
index 8c6fc80c3..91682edd5 100644
--- a/src/backend/utils/adt/meson.build
+++ b/src/backend/utils/adt/meson.build
@@ -83,6 +83,7 @@ backend_sources += files(
'rowtypes.c',
'ruleutils.c',
'selfuncs.c',
+ 'skipsupport.c',
'tid.c',
'timestamp.c',
'trigfuncs.c',
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 5f5d7959d..c1df7be9f 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6800,6 +6800,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
List *indexBoundQuals;
int indexcol;
bool eqQualHere;
+ bool found_skip;
bool found_saop;
bool found_is_null_op;
double num_sa_scans;
@@ -6825,6 +6826,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
indexBoundQuals = NIL;
indexcol = 0;
eqQualHere = false;
+ found_skip = false;
found_saop = false;
found_is_null_op = false;
num_sa_scans = 1;
@@ -6833,15 +6835,38 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
IndexClause *iclause = lfirst_node(IndexClause, lc);
ListCell *lc2;
+ /*
+ * XXX For now we just cost skip scans via generic rules: make a
+ * uniform assumption that there will be 10 primitive index scans per
+ * skipped attribute, relying on the "1/3 of all index pages" cap that
+ * this costing has used since Postgres 17. Also assume that skipping
+ * won't take place for an index that has fewer than 100 pages.
+ *
+ * The current approach to costing leaves much to be desired, but is
+ * at least better than nothing at all (keeping the code as it is on
+ * HEAD just makes testing and review inconvenient).
+ */
if (indexcol != iclause->indexcol)
{
/* Beginning of a new column's quals */
if (!eqQualHere)
- break; /* done if no '=' qual for indexcol */
+ {
+ found_skip = true; /* skip when no '=' qual for indexcol */
+ if (index->pages < 100)
+ break;
+ num_sa_scans += 10;
+ }
eqQualHere = false;
indexcol++;
if (indexcol != iclause->indexcol)
- break; /* no quals at all for indexcol */
+ {
+ /* no quals at all for indexcol */
+ found_skip = true;
+ if (index->pages < 100)
+ break;
+ num_sa_scans += 10 * (indexcol - iclause->indexcol);
+ continue;
+ }
}
/* Examine each indexqual associated with this index clause */
@@ -6914,6 +6939,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
if (index->unique &&
indexcol == index->nkeycolumns - 1 &&
eqQualHere &&
+ !found_skip &&
!found_saop &&
!found_is_null_op)
numIndexTuples = 1.0;
diff --git a/src/backend/utils/adt/skipsupport.c b/src/backend/utils/adt/skipsupport.c
new file mode 100644
index 000000000..9665e4985
--- /dev/null
+++ b/src/backend/utils/adt/skipsupport.c
@@ -0,0 +1,54 @@
+/*-------------------------------------------------------------------------
+ *
+ * skipsupport.c
+ * Support routines for B-Tree skip scans.
+ *
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/skipsupport.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/nbtree.h"
+#include "utils/lsyscache.h"
+#include "utils/skipsupport.h"
+
+/*
+ * Fill in SkipSupport given an operator class (opfamily + opcintype).
+ *
+ * On success, returns true, and initializes all SkipSupport fields for
+ * caller. Otherwise returns false, indicating that operator class has no
+ * skip support function.
+ */
+bool
+PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, bool reverse,
+ SkipSupport sksup)
+{
+ Oid skipSupportFunction;
+
+ /* Look for a skip support function */
+ skipSupportFunction = get_opfamily_proc(opfamily, opcintype, opcintype,
+ BTSKIPSUPPORT_PROC);
+ if (!OidIsValid(skipSupportFunction))
+ return false;
+
+ OidFunctionCall1(skipSupportFunction, PointerGetDatum(sksup));
+
+ if (reverse)
+ {
+ Datum low_elem = sksup->low_elem;
+
+ sksup->low_elem = sksup->high_elem;
+ sksup->high_elem = low_elem;
+ }
+
+ return true;
+}
diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c
index 45eb1b2fe..a9222f896 100644
--- a/src/backend/utils/adt/uuid.c
+++ b/src/backend/utils/adt/uuid.c
@@ -13,12 +13,15 @@
#include "postgres.h"
+#include <limits.h>
+
#include "common/hashfn.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "port/pg_bswap.h"
#include "utils/fmgrprotos.h"
#include "utils/guc.h"
+#include "utils/skipsupport.h"
#include "utils/sortsupport.h"
#include "utils/timestamp.h"
#include "utils/uuid.h"
@@ -390,6 +393,68 @@ uuid_abbrev_convert(Datum original, SortSupport ssup)
return res;
}
+static Datum
+uuid_decrement(Relation rel, Datum existing)
+{
+ pg_uuid_t *uuid;
+
+ uuid = (pg_uuid_t *) palloc(UUID_LEN);
+ memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN);
+ for (int i = UUID_LEN - 1; i >= 0; i--)
+ {
+ if (uuid->data[i] > 0)
+ {
+ uuid->data[i]--;
+ return UUIDPGetDatum(uuid);
+ }
+ uuid->data[i] = UCHAR_MAX;
+ }
+
+ Assert(false);
+
+ return UUIDPGetDatum(uuid);
+}
+
+static Datum
+uuid_increment(Relation rel, Datum existing)
+{
+ pg_uuid_t *uuid;
+
+ uuid = (pg_uuid_t *) palloc(UUID_LEN);
+ memcpy(uuid, DatumGetUUIDP(existing), UUID_LEN);
+ for (int i = UUID_LEN - 1; i >= 0; i--)
+ {
+ if (uuid->data[i] < UCHAR_MAX)
+ {
+ uuid->data[i]++;
+ return UUIDPGetDatum(uuid);
+ }
+ uuid->data[i] = 0;
+ }
+
+ Assert(false);
+
+ return UUIDPGetDatum(uuid);
+}
+
+Datum
+uuid_skipsupport(PG_FUNCTION_ARGS)
+{
+ SkipSupport sksup = (SkipSupport) PG_GETARG_POINTER(0);
+ pg_uuid_t *uuid_min = palloc(UUID_LEN);
+ pg_uuid_t *uuid_max = palloc(UUID_LEN);
+
+ memset(uuid_min->data, 0x00, UUID_LEN);
+ memset(uuid_max->data, 0xFF, UUID_LEN);
+
+ sksup->decrement = uuid_decrement;
+ sksup->increment = uuid_increment;
+ sksup->low_elem = UUIDPGetDatum(uuid_min);
+ sksup->high_elem = UUIDPGetDatum(uuid_max);
+
+ PG_RETURN_VOID();
+}
+
/* hash index support */
Datum
uuid_hash(PG_FUNCTION_ARGS)
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 6f4188599..8ec3f150a 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -28,6 +28,7 @@
#include "access/commit_ts.h"
#include "access/gin.h"
+#include "access/nbtree.h"
#include "access/slru.h"
#include "access/toast_compression.h"
#include "access/twophase.h"
@@ -3523,6 +3524,17 @@ struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ /* XXX Remove before commit */
+ {
+ {"skipscan_prefix_cols", PGC_SUSET, DEVELOPER_OPTIONS,
+ NULL, NULL,
+ GUC_NOT_IN_SAMPLE
+ },
+ &skipscan_prefix_cols,
+ INDEX_MAX_KEYS, 0, INDEX_MAX_KEYS,
+ NULL, NULL, NULL
+ },
+
{
/* Can't be set in postgresql.conf */
{"server_version_num", PGC_INTERNAL, PRESET_OPTIONS,
diff --git a/doc/src/sgml/btree.sgml b/doc/src/sgml/btree.sgml
index 2b3997988..9662fb2ba 100644
--- a/doc/src/sgml/btree.sgml
+++ b/doc/src/sgml/btree.sgml
@@ -583,6 +583,19 @@ options(<replaceable>relopts</replaceable> <type>local_relopts *</type>) returns
</para>
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><function>skipsupport</function></term>
+ <listitem>
+ <para>
+ Optionally, a btree operator family may provide a <firstterm>skip
+ support</firstterm> function, registered under support function
+ number 6. These functions allow the B-tree code to more efficiently
+ navigate the index structure via an index <quote>skip scan</quote>. The
+ APIs involved in this are defined in
+ <filename>src/include/utils/skipsupport.h</filename>.
+ </para>
+ </listitem>
+ </varlistentry>
</variablelist>
</sect2>
diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml
index 22d8ad1aa..f17dd3456 100644
--- a/doc/src/sgml/xindex.sgml
+++ b/doc/src/sgml/xindex.sgml
@@ -461,6 +461,13 @@
</entry>
<entry>5</entry>
</row>
+ <row>
+ <entry>
+ Return the addresses of C-callable skip support function(s)
+ (optional)
+ </entry>
+ <entry>6</entry>
+ </row>
</tbody>
</tgroup>
</table>
@@ -1056,7 +1063,8 @@ DEFAULT FOR TYPE int8 USING btree FAMILY integer_ops AS
FUNCTION 1 btint8cmp(int8, int8) ,
FUNCTION 2 btint8sortsupport(internal) ,
FUNCTION 3 in_range(int8, int8, int8, boolean, boolean) ,
- FUNCTION 4 btequalimage(oid) ;
+ FUNCTION 4 btequalimage(oid) ,
+ FUNCTION 6 btint8skipsupport(internal);
CREATE OPERATOR CLASS int4_ops
DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS
@@ -1069,7 +1077,8 @@ DEFAULT FOR TYPE int4 USING btree FAMILY integer_ops AS
FUNCTION 1 btint4cmp(int4, int4) ,
FUNCTION 2 btint4sortsupport(internal) ,
FUNCTION 3 in_range(int4, int4, int4, boolean, boolean) ,
- FUNCTION 4 btequalimage(oid) ;
+ FUNCTION 4 btequalimage(oid) ,
+ FUNCTION 6 btint4skipsupport(internal);
CREATE OPERATOR CLASS int2_ops
DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS
@@ -1082,7 +1091,8 @@ DEFAULT FOR TYPE int2 USING btree FAMILY integer_ops AS
FUNCTION 1 btint2cmp(int2, int2) ,
FUNCTION 2 btint2sortsupport(internal) ,
FUNCTION 3 in_range(int2, int2, int2, boolean, boolean) ,
- FUNCTION 4 btequalimage(oid) ;
+ FUNCTION 4 btequalimage(oid) ,
+ FUNCTION 6 btint2skipsupport(internal);
ALTER OPERATOR FAMILY integer_ops USING btree ADD
-- cross-type comparisons int8 vs int2
diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out
index ae54cb254..8b6b775c1 100644
--- a/src/test/regress/expected/alter_generic.out
+++ b/src/test/regress/expected/alter_generic.out
@@ -362,9 +362,9 @@ ERROR: invalid operator number 0, must be between 1 and 5
ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types
ERROR: operator argument types must be specified in ALTER OPERATOR FAMILY
ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function
-ERROR: invalid function number 0, must be between 1 and 5
-ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5
-ERROR: invalid function number 6, must be between 1 and 5
+ERROR: invalid function number 0, must be between 1 and 6
+ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6
+ERROR: invalid function number 7, must be between 1 and 6
ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY
ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY
DROP OPERATOR FAMILY alt_opf4 USING btree;
diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out
index 3bbe4c5f9..a8d5be6c1 100644
--- a/src/test/regress/expected/psql.out
+++ b/src/test/regress/expected/psql.out
@@ -5138,9 +5138,10 @@ List of access methods
btree | uuid_ops | uuid | uuid | 1 | uuid_cmp
btree | uuid_ops | uuid | uuid | 2 | uuid_sortsupport
btree | uuid_ops | uuid | uuid | 4 | btequalimage
+ btree | uuid_ops | uuid | uuid | 6 | uuid_skipsupport
hash | uuid_ops | uuid | uuid | 1 | uuid_hash
hash | uuid_ops | uuid | uuid | 2 | uuid_hash_extended
-(5 rows)
+(6 rows)
-- check \dconfig
set work_mem = 10240;
diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql
index de58d268d..4246afefd 100644
--- a/src/test/regress/sql/alter_generic.sql
+++ b/src/test/regress/sql/alter_generic.sql
@@ -310,7 +310,7 @@ ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 6 < (int4, int2); -- ope
ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 0 < (int4, int2); -- operator number should be between 1 and 5
ALTER OPERATOR FAMILY alt_opf4 USING btree ADD OPERATOR 1 < ; -- operator without argument types
ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 0 btint42cmp(int4, int2); -- invalid options parsing function
-ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 6 btint42cmp(int4, int2); -- function number should be between 1 and 5
+ALTER OPERATOR FAMILY alt_opf4 USING btree ADD FUNCTION 7 btint42cmp(int4, int2); -- function number should be between 1 and 6
ALTER OPERATOR FAMILY alt_opf4 USING btree ADD STORAGE invalid_storage; -- Ensure STORAGE is not a part of ALTER OPERATOR FAMILY
DROP OPERATOR FAMILY alt_opf4 USING btree;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e6c1caf64..369eca3a4 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -218,6 +218,7 @@ BTScanPos
BTScanPosData
BTScanPosItem
BTShared
+BTSkipPreproc
BTSortArrayContext
BTSpool
BTStack
@@ -2650,6 +2651,8 @@ SingleBoundSortItem
SinglePartitionSpec
Size
SkipPages
+SkipSupport
+SkipSupportData
SlabBlock
SlabContext
SlabSlot
--
2.45.2