v16-0003-Fix-regressions-in-unsympathetic-skip-scan-cases.patch

application/octet-stream

Filename: v16-0003-Fix-regressions-in-unsympathetic-skip-scan-cases.patch
Type: application/octet-stream
Part: 0
Message: Re: Adding skip scan (including MDAM style range skip scan) to nbtree

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v16-0003
Subject: Fix regressions in unsympathetic skip scan cases.
File+
src/backend/access/nbtree/nbtsearch.c 5 0
src/backend/access/nbtree/nbtutils.c 70 19
src/include/access/nbtree.h 3 2
From c2e08b1dc180e3684afaa3f0bfd6fead0b401d7b Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sat, 16 Nov 2024 15:58:41 -0500
Subject: [PATCH v16 3/3] Fix regressions in unsympathetic skip scan cases.

Fix regressions in cases that are nominally eligible to use skip scan
but can never actually benefit from skipping.  These are cases where the
leading skipped prefix column contains many distinct values -- often as
many distinct values are there are total index tuples.

For example, the test case posted here is fixed by the work from this
commit:

https://postgr.es/m/51d00219180323d121572e1f83ccde2a@oss.nttdata.com

Note that this commit doesn't actually change anything about when or how
skip scan decides when or how to skip.  It just avoids wasting CPU
cycles on uselessly maintaining a skip array at the tuple granularity,
preferring to maintain the skip arrays at something closer to the page
granularity when that makes sense.  See:

https://www.postgresql.org/message-id/flat/CAH2-Wz%3DE7XrkvscBN0U6V81NK3Q-dQOmivvbEsjG-zwEfDdFpg%40mail.gmail.com#7d34e8aa875d7a718043834c5ef4c167

Doing well on cases like this is important because we can't expect the
optimizer to never choose an affected plan -- we prefer to solve these
problems in the executor, which has access to the most reliable and
current information about the index.  The optimizer can afford to be
very optimistic about skipping if actual runtime scan behavior is very
similar to a traditional full index scan in the worst case.  See
"optimizer" section from the original intro mail for more information:

https://postgr.es/m/CAH2-Wzmn1YsLzOGgjAQZdn1STSG_y8qP__vggTaPAYXJP%2BG4bw%40mail.gmail.com

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Masahiro Ikeda <ikedamsh@oss.nttdata.com>
Discussion: https://postgr.es/m/CAH2-Wz=Y93jf5WjoOsN=xvqpMjRy-bxCE037bVFi-EasrpeUJA@mail.gmail.com
---
 src/include/access/nbtree.h           |  5 +-
 src/backend/access/nbtree/nbtsearch.c |  5 ++
 src/backend/access/nbtree/nbtutils.c  | 89 +++++++++++++++++++++------
 3 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d841e85bc..06ccab8cf 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1112,11 +1112,12 @@ typedef struct BTReadPageState
 	bool		firstmatch;		/* at least one match so far?  */
 
 	/*
-	 * Private _bt_checkkeys state used to manage "look ahead" optimization
-	 * (only used during scans with array keys)
+	 * Private _bt_checkkeys state used to manage "look ahead" and skip array
+	 * optimizations (only used during scans with array keys)
 	 */
 	int16		rechecks;
 	int16		targetdistance;
+	bool		beyondskip;
 
 } BTReadPageState;
 
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 43e321896..578bafbf4 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1646,6 +1646,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	pstate.firstmatch = false;
 	pstate.rechecks = 0;
 	pstate.targetdistance = 0;
+	pstate.beyondskip = false;
 
 	/*
 	 * Prechecking the value of the continuescan flag for the last item on the
@@ -1837,6 +1838,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 
 			truncatt = BTreeTupleGetNAtts(itup, rel);
 			pstate.prechecked = false;	/* precheck didn't cover HIKEY */
+			pstate.beyondskip = false; /* reset for finaltup */
 			_bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
 		}
 
@@ -1893,6 +1895,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 			else
 				tuple_alive = true;
 
+			if (offnum == minoff)
+				pstate.beyondskip = false; /* reset for finaltup */
+
 			itup = (IndexTuple) PageGetItem(page, iid);
 			Assert(!BTreeTupleIsPivot(itup));
 
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 9de7c40e8..b70b58e0c 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -151,7 +151,8 @@ static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
 static void _bt_mark_scankey_required(ScanKey skey);
 static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
 							  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
-							  bool advancenonrequired, bool prechecked, bool firstmatch,
+							  bool advancenonrequired, bool beyondskip,
+							  bool prechecked, bool firstmatch,
 							  bool *continuescan, int *ikey);
 static bool _bt_check_rowcompare(ScanKey skey,
 								 IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
@@ -2343,7 +2344,6 @@ _bt_scankey_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
 	Form_pg_attribute attr;
 
 	Assert(skey->sk_flags & SK_SEARCHARRAY);
-	Assert(!(skey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)));
 
 	/* Regular (non-skip) array? */
 	if (array->num_elems != -1)
@@ -2489,7 +2489,6 @@ _bt_scankey_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array)
 	Form_pg_attribute attr;
 
 	Assert(skey->sk_flags & SK_SEARCHARRAY);
-	Assert(!(skey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)));
 
 	/* Regular (non-skip) array? */
 	if (array->num_elems != -1)
@@ -3135,7 +3134,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	ScanDirection dir = so->currPos.dir;
 	int			arrayidx = 0;
 	bool		beyond_end_advance = false,
+				has_skip_array = false,
 				has_required_opposite_direction_only = false,
+				has_required_opposite_direction_skip = false,
 				oppodir_inequality_sktrig = false,
 				all_required_satisfied = true,
 				all_satisfied = true;
@@ -3189,6 +3190,20 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 			{
 				array = &so->arrayKeys[arrayidx++];
 				Assert(array->scan_key == ikey);
+				if (array->num_elems == -1)
+				{
+					has_skip_array = true;
+					if (!array->null_elem)
+					{
+						if (ScanDirectionIsForward(dir) ?
+							array->low_compare : array->high_compare)
+							has_required_opposite_direction_skip = true;
+						if (ScanDirectionIsForward(dir) ?
+							(cur->sk_flags & SK_BT_NULLS_FIRST) :
+							!(cur->sk_flags & SK_BT_NULLS_FIRST))
+							has_required_opposite_direction_skip = true;
+					}
+				}
 			}
 		}
 		else
@@ -3211,8 +3226,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 
 		if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
 		{
-			Assert(sktrig_required);
-
 			required = true;
 
 			if (cur->sk_attno > tupnatts)
@@ -3496,7 +3509,8 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * higher-order arrays (might exhaust all the scan's arrays instead, which
 	 * ends the top-level scan).
 	 */
-	if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir))
+	if (beyond_end_advance && sktrig_required &&
+		!_bt_advance_array_keys_increment(scan, dir))
 		goto end_toplevel_scan;
 
 	Assert(_bt_verify_keys_with_arraykeys(scan));
@@ -3528,7 +3542,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 
 		/* Recheck _bt_check_compare on behalf of caller */
 		if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
-							  false, false, false,
+							  false, !sktrig_required, false, false,
 							  &continuescan, &nsktrig) &&
 			!so->scanBehind)
 		{
@@ -3615,9 +3629,20 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * for _bt_check_compare to behave as if they are required in the current
 	 * scan direction to deal with NULLs.  We'll account for that separately.)
 	 */
-	Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts,
-										false, 0, NULL) ==
-		   !all_required_satisfied);
+	Assert((_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts,
+										 false, 0, NULL) ==
+			!all_required_satisfied) || pstate->beyondskip);
+
+	/*
+	 * Optimization: if a scan with a skip array required "beyond end of array
+	 * element" array advancement (not necessarily in respect of the skip
+	 * array itself), we assume that the page isn't a particularly good target
+	 * for skip scan, and stop maintaining the scan's skip arrays until we
+	 * reach the page's finaltup, if any.
+	 */
+	if (has_skip_array && beyond_end_advance &&
+		!has_required_opposite_direction_skip && pstate->finaltup)
+		pstate->beyondskip = true;
 
 	/*
 	 * We generally permit primitive index scans to continue onto the next
@@ -3867,7 +3892,8 @@ end_toplevel_scan:
  * make the scan much more efficient: if there are few distinct values in "x",
  * we'll be able to skip over many irrelevant leaf pages.  (If on the other
  * hand there are many distinct values in "x" then the scan will degenerate
- * into a full index scan at run time.)
+ * into a full index scan at run time, but we'll be no worse off overall.
+ * _bt_checkkeys's 'beyondskip' optimization keeps the runtime overhead low.)
  *
  * If possible, redundant keys are eliminated: we keep only the tightest
  * >/>= bound and the tightest </<= bound, and if there's an = key then
@@ -4908,7 +4934,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 	Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
 
 	res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
-							arrayKeys, pstate->prechecked, pstate->firstmatch,
+							arrayKeys, pstate->beyondskip,
+							pstate->prechecked, pstate->firstmatch,
 							&pstate->continuescan, &ikey);
 
 #ifdef USE_ASSERT_CHECKING
@@ -4920,7 +4947,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 		 * Assert that the scan isn't in danger of becoming confused.
 		 */
 		Assert(!so->scanBehind && !so->oppositeDirCheck);
-		Assert(!pstate->prechecked && !pstate->firstmatch);
+		Assert(!pstate->beyondskip && !pstate->prechecked &&
+			   !pstate->firstmatch);
 		Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
 											 tupnatts, false, 0, NULL));
 	}
@@ -4934,7 +4962,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 		 * get the same answer without those optimizations
 		 */
 		Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
-										false, false, false,
+										false, pstate->beyondskip,
+										false, false,
 										&dcontinuescan, &dikey));
 		Assert(pstate->continuescan == dcontinuescan);
 	}
@@ -5065,7 +5094,7 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
 	Assert(so->numArrayKeys);
 
 	_bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc,
-					  false, false, false, &continuescan, &ikey);
+					  false, false, false, false, &continuescan, &ikey);
 
 	if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
 		return false;
@@ -5110,11 +5139,19 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
  *
  * Pass advancenonrequired=false to avoid all array related side effects.
  * This allows _bt_advance_array_keys caller to avoid infinite recursion.
+ *
+ * Callers with skip arrays can pass beyondskip=true to have us assume that
+ * the scan is still likely to be before the current array keys according to
+ * _bt_tuple_before_array_skeys.  We can safely avoid evaluating skip array
+ * scan keys when this happens.  Note that this makes us treat any required
+ * SAOP arrays as non-required -- skip scan caller is expected to disable this
+ * behavior upon reaching the page's finaltup.
  */
 static bool
 _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
 				  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
-				  bool advancenonrequired, bool prechecked, bool firstmatch,
+				  bool advancenonrequired, bool beyondskip,
+				  bool prechecked, bool firstmatch,
 				  bool *continuescan, int *ikey)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
@@ -5131,10 +5168,24 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
 
 		/*
 		 * Check if the key is required in the current scan direction, in the
-		 * opposite scan direction _only_, or in neither direction
+		 * opposite scan direction _only_, or in neither direction -- though
+		 * not when "beyond end advancement" skip scan optimization is in use
 		 */
-		if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
-			((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
+		if (beyondskip)
+		{
+			/*
+			 * "Beyond end advancement" skip scan optimization.
+			 *
+			 * Just skip over any skip array scan keys.  Treat all other scan
+			 * keys as not required for the scan to continue.
+			 */
+			Assert(!prechecked);
+
+			if (key->sk_flags & SK_BT_SKIP)
+				continue;
+		}
+		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
+				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
 			requiredSameDir = true;
 		else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) ||
 				 ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir)))
-- 
2.45.2