v29-0004-Apply-low-order-skip-key-in-_bt_first-more-often.patch

application/octet-stream

Filename: v29-0004-Apply-low-order-skip-key-in-_bt_first-more-often.patch
Type: application/octet-stream
Part: 1
Message: Re: Adding skip scan (including MDAM style range skip scan) to nbtree

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v29-0004
Subject: Apply low-order skip key in _bt_first more often.
File+
src/backend/access/nbtree/nbtpreprocesskeys.c 181 0
src/test/regress/expected/create_index.out 21 0
src/test/regress/sql/create_index.sql 10 0
From 1c83fbe2fad46ab73603e8f704b5c6f1a4c3fc12 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 13 Jan 2025 16:08:32 -0500
Subject: [PATCH v29 4/5] Apply low-order skip key in _bt_first more often.

Convert low_compare and high_compare nbtree skip array inequalities
(with opclasses that offer skip support) such that _bt_first is
consistently able to use later keys when descending the tree within
_bt_first.

For example, an index qual "WHERE a > 5 AND b = 2" is now converted to
"WHERE a >= 6 AND b = 2" by a new preprocessing step that takes place
after a final low_compare and/or high_compare are chosen by all earlier
preprocessing steps.  That way the scan's initial call to _bt_first will
use "WHERE a >= 6 AND b = 2" to find the initial leaf level position,
rather than merely using "WHERE a > 5" -- "b = 2" can always be applied.
This has a decent chance of making the scan avoid an extra _bt_first
call that would otherwise be needed just to determine the lowest-sorting
"a" value in the index (the lowest that still satisfies "WHERE a > 5").

The transformation process can only lower the total number of index
pages read when the use of a more restrictive set of initial positioning
keys in _bt_first actually allows the scan to land on some later leaf
page directly, relative to the unoptimized case (or on an earlier leaf
page directly, when scanning backwards).  The savings can be far greater
when affected skip arrays come after some higher-order array.  For
example, a qual "WHERE x IN (1, 2, 3) AND y > 5 AND z = 2" can now save
as many as 3 _bt_first calls as a result of these transformations (there
can be as many as 1 _bt_first call saved per "x" array element).

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wz=FJ78K3WsF3iWNxWnUCY9f=Jdg3QPxaXE=uYUbmuRz5Q@mail.gmail.com
---
 src/backend/access/nbtree/nbtpreprocesskeys.c | 181 ++++++++++++++++++
 src/test/regress/expected/create_index.out    |  21 ++
 src/test/regress/sql/create_index.sql         |  10 +
 3 files changed, 212 insertions(+)

diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index 04792eab8..999bb4578 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -50,6 +50,12 @@ static bool _bt_saoparray_shrink(IndexScanDesc scan, ScanKey arraysk,
 								 BTArrayKeyInfo *array, bool *qual_ok);
 static bool _bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey,
 								 BTArrayKeyInfo *array, bool *qual_ok);
+static void _bt_skiparray_strat_adjust(IndexScanDesc scan, ScanKey arraysk,
+									   BTArrayKeyInfo *array);
+static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk,
+										  BTArrayKeyInfo *array);
+static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
+										  BTArrayKeyInfo *array);
 static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys);
 static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
 static int	_bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops,
@@ -1290,6 +1296,172 @@ _bt_skiparray_shrink(IndexScanDesc scan, ScanKey skey, BTArrayKeyInfo *array,
 	return true;
 }
 
+/*
+ * Applies the opfamily's skip support routine to convert the skip array's >
+ * low_compare key (if any) into a >= key, and to convert its < high_compare
+ * key (if any) into a <= key.  Decrements the high_compare key's sk_argument,
+ * and/or increments the low_compare key's sk_argument (also adjusts their
+ * operator strategies, while changing the operator as appropriate).
+ *
+ * This optional optimization reduces the number of descents required within
+ * _bt_first.  Whenever _bt_first is called with a skip array whose current
+ * array element is the sentinel value MINVAL, using a transformed >= key
+ * instead of using the original > key makes it safe to include lower-order
+ * scan keys in the insertion scan key (there must be lower-order scan keys
+ * after the skip array).  We will avoid an extra _bt_first to find the first
+ * value in the index > sk_argument -- at least when the first real matching
+ * value in the index happens to be an exact match for the sk_argument value
+ * that we produced here by incrementing the original input key's sk_argument.
+ * (Backwards scans derive the same benefit when they encounter the sentinel
+ * value MAXVAL, by converting the high_compare key from < to <=.)
+ *
+ * Note: The transformation is only correct when it cannot allow the scan to
+ * overlook matching tuples, but we don't have enough semantic information to
+ * safely make sure that can't happen during scans with cross-type operators.
+ * That's why we'll never apply the transformation in cross-type scenarios.
+ * For example, if we attempted to convert "sales_ts > '2024-01-01'::date"
+ * into "sales_ts >= '2024-01-02'::date" given a "sales_ts" attribute whose
+ * input opclass is timestamp_ops, the scan would overlook _all_ tuples for
+ * sales that fell on '2024-01-01'.
+ *
+ * Note: We can safely modify array->low_compare/array->high_compare in place
+ * because they just point to copies of our scan->keyData[] input scan keys
+ * (namely the copies returned by _bt_preprocess_array_keys to be used as
+ * input into the standard preprocessing steps in _bt_preprocess_keys).
+ * Everything will be reset if there's a rescan.
+ */
+static void
+_bt_skiparray_strat_adjust(IndexScanDesc scan, ScanKey arraysk,
+						   BTArrayKeyInfo *array)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	MemoryContext oldContext;
+
+	/*
+	 * Called last among all preprocessing steps, when the skip array's final
+	 * low_compare and high_compare have both been chosen
+	 */
+	Assert(so->skipScan);
+	Assert(arraysk->sk_flags & SK_BT_SKIP);
+	Assert(array->num_elems == -1 && !array->null_elem && array->sksup);
+
+	oldContext = MemoryContextSwitchTo(so->arrayContext);
+
+	if (array->high_compare &&
+		array->high_compare->sk_strategy == BTLessStrategyNumber)
+		_bt_skiparray_strat_decrement(scan, arraysk, array);
+
+	if (array->low_compare &&
+		array->low_compare->sk_strategy == BTGreaterStrategyNumber)
+		_bt_skiparray_strat_increment(scan, arraysk, array);
+
+	MemoryContextSwitchTo(oldContext);
+}
+
+/*
+ * Convert skip array's > low_compare key into a >= key
+ */
+static void
+_bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk,
+							  BTArrayKeyInfo *array)
+{
+	Relation	rel = scan->indexRelation;
+	Oid			opfamily = rel->rd_opfamily[arraysk->sk_attno - 1],
+				opcintype = rel->rd_opcintype[arraysk->sk_attno - 1],
+				leop;
+	RegProcedure cmp_proc;
+	ScanKey		high_compare = array->high_compare;
+	Datum		orig_sk_argument = high_compare->sk_argument,
+				new_sk_argument;
+	bool		uflow;
+
+	Assert(high_compare->sk_strategy == BTLessStrategyNumber);
+
+	/*
+	 * Only perform the transformation when the operator type matches the
+	 * index attribute's input opclass type
+	 */
+	if (high_compare->sk_subtype != opcintype &&
+		high_compare->sk_subtype != InvalidOid)
+		return;
+
+	/* Decrement, handling underflow by marking the qual unsatisfiable */
+	new_sk_argument = array->sksup->decrement(rel, orig_sk_argument, &uflow);
+	if (uflow)
+	{
+		BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+		so->qual_ok = false;
+		return;
+	}
+
+	/* Look up <= operator (might fail) */
+	leop = get_opfamily_member(opfamily, opcintype, opcintype,
+							   BTLessEqualStrategyNumber);
+	if (!OidIsValid(leop))
+		return;
+	cmp_proc = get_opcode(leop);
+	if (RegProcedureIsValid(cmp_proc))
+	{
+		/* Transform < high_compare key into <= key */
+		fmgr_info(cmp_proc, &high_compare->sk_func);
+		high_compare->sk_argument = new_sk_argument;
+		high_compare->sk_strategy = BTLessEqualStrategyNumber;
+	}
+}
+
+/*
+ * Convert skip array's < low_compare key into a <= key
+ */
+static void
+_bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
+							  BTArrayKeyInfo *array)
+{
+	Relation	rel = scan->indexRelation;
+	Oid			opfamily = rel->rd_opfamily[arraysk->sk_attno - 1],
+				opcintype = rel->rd_opcintype[arraysk->sk_attno - 1],
+				geop;
+	RegProcedure cmp_proc;
+	ScanKey		low_compare = array->low_compare;
+	Datum		orig_sk_argument = low_compare->sk_argument,
+				new_sk_argument;
+	bool		oflow;
+
+	Assert(low_compare->sk_strategy == BTGreaterStrategyNumber);
+
+	/*
+	 * Only perform the transformation when the operator type matches the
+	 * index attribute's input opclass type
+	 */
+	if (low_compare->sk_subtype != opcintype &&
+		low_compare->sk_subtype != InvalidOid)
+		return;
+
+	/* Increment, handling overflow by marking the qual unsatisfiable */
+	new_sk_argument = array->sksup->increment(rel, orig_sk_argument, &oflow);
+	if (oflow)
+	{
+		BTScanOpaque so = (BTScanOpaque) scan->opaque;
+
+		so->qual_ok = false;
+		return;
+	}
+
+	/* Look up >= operator (might fail) */
+	geop = get_opfamily_member(opfamily, opcintype, opcintype,
+							   BTGreaterEqualStrategyNumber);
+	if (!OidIsValid(geop))
+		return;
+	cmp_proc = get_opcode(geop);
+	if (RegProcedureIsValid(cmp_proc))
+	{
+		/* Transform > low_compare key into >= key */
+		fmgr_info(cmp_proc, &low_compare->sk_func);
+		low_compare->sk_argument = new_sk_argument;
+		low_compare->sk_strategy = BTGreaterEqualStrategyNumber;
+	}
+}
+
 /*
  *	_bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
  *
@@ -1825,6 +1997,15 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
 				}
 				else
 				{
+					/*
+					 * Any skip array low_compare and high_compare scan keys
+					 * are now final.  Transform the array's > low_compare key
+					 * into a >= key (and < high_compare keys into a <= key).
+					 */
+					if (array->num_elems == -1 && array->sksup &&
+						!array->null_elem)
+						_bt_skiparray_strat_adjust(scan, outkey, array);
+
 					/* Match found, so done with this array */
 					arrayidx++;
 				}
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index 15dde752f..fa4517e2d 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -2589,6 +2589,27 @@ ORDER BY thousand;
         1 |     1001
 (1 row)
 
+-- Skip array preprocessing increments "thousand > -1" to  "thousand >= 0"
+explain (costs off)
+SELECT thousand, tenthous FROM tenk1
+WHERE thousand > -1 AND tenthous IN (1001,3000)
+ORDER BY thousand limit 2;
+                                            QUERY PLAN                                            
+--------------------------------------------------------------------------------------------------
+ Limit
+   ->  Index Only Scan using tenk1_thous_tenthous on tenk1
+         Index Cond: ((thousand > '-1'::integer) AND (tenthous = ANY ('{1001,3000}'::integer[])))
+(3 rows)
+
+SELECT thousand, tenthous FROM tenk1
+WHERE thousand > -1 AND tenthous IN (1001,3000)
+ORDER BY thousand limit 2;
+ thousand | tenthous 
+----------+----------
+        0 |     3000
+        1 |     1001
+(2 rows)
+
 --
 -- Check elimination of constant-NULL subexpressions
 --
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index 40ba3d65b..e8c16959a 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -993,6 +993,16 @@ SELECT thousand, tenthous FROM tenk1
 WHERE thousand < 3 and thousand <= 2 AND tenthous = 1001
 ORDER BY thousand;
 
+-- Skip array preprocessing increments "thousand > -1" to  "thousand >= 0"
+explain (costs off)
+SELECT thousand, tenthous FROM tenk1
+WHERE thousand > -1 AND tenthous IN (1001,3000)
+ORDER BY thousand limit 2;
+
+SELECT thousand, tenthous FROM tenk1
+WHERE thousand > -1 AND tenthous IN (1001,3000)
+ORDER BY thousand limit 2;
+
 --
 -- Check elimination of constant-NULL subexpressions
 --
-- 
2.47.2