v6-0002-Normalize-nbtree-truncated-high-key-array-behavio.patch

application/octet-stream

Filename: v6-0002-Normalize-nbtree-truncated-high-key-array-behavio.patch
Type: application/octet-stream
Part: 2
Message: Re: Adding skip scan (including MDAM style range skip scan) to nbtree

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v6-0002
Subject: Normalize nbtree truncated high key array behavior.
File+
src/backend/access/nbtree/nbtree.c 4 0
src/backend/access/nbtree/nbtsearch.c 22 0
src/backend/access/nbtree/nbtutils.c 66 53
src/include/access/nbtree.h 3 0
From 44e64c24e6ba073a2f97cef15ae49281c2046e86 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Thu, 8 Aug 2024 13:51:18 -0400
Subject: [PATCH v6 2/4] Normalize nbtree truncated high key array behavior.

Commit 5bf748b8 taught nbtree ScalarArrayOp array processing to decide
when and how to start the next primitive index scan based on physical
index characteristics.  This included rules for deciding whether to
start a new primitive index scan (or whether to move onto the right
sibling leaf page instead) whenever the scan encounters a leaf high key
with truncated lower-order columns whose omitted/-inf values are covered
by one or more arrays.

Prior to this commit, nbtree would treat a truncated column as
satisfying a scan key that marked required in the current scan
direction.  It would just give up and start a new primitive index scan
in cases involving inequalities required in the opposite direction only
(in practice this meant > and >= strategy scan keys, since only forward
scans consider the page high key like this).

Bring > and >= strategy scan keys in line with other required scan key
types: have nbtree persist with its current primitive index scan
regardless of the operator strategy in use.  This requires scheduling
and then performing an explicit check of the next page's high key (if
any) at the point that _bt_readpage is next called.

Although this could be considered a stand alone piece of work, it's
mostly intended as preparation for an upcoming patch that adds skip scan
optimizations to nbtree.  Without this work there are cases where the
scan's skip arrays trigger an excessive number of primitive index scans
due to most high keys having a truncated attribute that was previously
treated as not satisfying a required > or >= strategy scan key.

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wz=9A_UtM7HzUThSkQ+BcrQsQZuNhWOvQWK06PRkEp=SKQ@mail.gmail.com
---
 src/include/access/nbtree.h           |   3 +
 src/backend/access/nbtree/nbtree.c    |   4 +
 src/backend/access/nbtree/nbtsearch.c |  22 +++++
 src/backend/access/nbtree/nbtutils.c  | 119 ++++++++++++++------------
 4 files changed, 95 insertions(+), 53 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9af9b3ecd..f578cdb73 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1048,6 +1048,7 @@ typedef struct BTScanOpaqueData
 	int			numArrayKeys;	/* number of equality-type array keys */
 	bool		needPrimScan;	/* New prim scan to continue in current dir? */
 	bool		scanBehind;		/* Last array advancement matched -inf attr? */
+	bool		oppoDirCheck;	/* check opposite dir scan keys? */
 	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
@@ -1288,6 +1289,8 @@ extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 						  IndexTuple tuple, int tupnatts);
+extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+								  IndexTuple finaltup);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index dfef6c12d..e055571c8 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -332,6 +332,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 
 	so->needPrimScan = false;
 	so->scanBehind = false;
+	so->oppoDirCheck = false;
 	so->arrayKeys = NULL;
 	so->orderProcs = NULL;
 	so->arrayContext = NULL;
@@ -375,6 +376,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	so->markItemIndex = -1;
 	so->needPrimScan = false;
 	so->scanBehind = false;
+	so->oppoDirCheck = false;
 	BTScanPosUnpinIfPinned(so->markPos);
 	BTScanPosInvalidate(so->markPos);
 
@@ -629,6 +631,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 		 */
 		so->needPrimScan = false;
 		so->scanBehind = false;
+		so->oppoDirCheck = false;
 	}
 	else
 	{
@@ -673,6 +676,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 				}
 				so->needPrimScan = true;
 				so->scanBehind = false;
+				so->oppoDirCheck = false;
 				*pageno = InvalidBlockNumber;
 				exit_loop = true;
 			}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 4b91a192e..e5f941e0a 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1704,6 +1704,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 			ItemId		iid = PageGetItemId(page, P_HIKEY);
 
 			pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+
+			if (unlikely(so->oppoDirCheck))
+			{
+				/*
+				 * Last _bt_readpage call scheduled precheck of finaltup for
+				 * required scan keys up to and including a > or >= scan key
+				 * (necessary because > and >= are only generally considered
+				 * required when scanning backwards)
+				 */
+				Assert(so->scanBehind);
+				so->oppoDirCheck = false;
+				if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
+				{
+					/*
+					 * Back out of continuing with this leaf page -- schedule
+					 * another primitive index scan after all
+					 */
+					so->currPos.moreRight = false;
+					so->needPrimScan = true;
+					return false;
+				}
+			}
 		}
 
 		/* load items[] in ascending order */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index c22ccec78..98688a3d6 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1362,7 +1362,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
 			curArrayKey->cur_elem = 0;
 		skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
 	}
-	so->scanBehind = false;
+	so->scanBehind = so->oppoDirCheck = false;	/* reset */
 }
 
 /*
@@ -1671,8 +1671,7 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
 
 	Assert(so->numArrayKeys);
 
-	/* scanBehind flag doesn't persist across primitive index scans - reset */
-	so->scanBehind = false;
+	so->scanBehind = so->oppoDirCheck = false;	/* reset */
 
 	/*
 	 * Array keys are advanced within _bt_checkkeys when the scan reaches the
@@ -1808,7 +1807,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 		Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
 											 tupnatts, false, 0, NULL));
 
-		so->scanBehind = false; /* reset */
+		so->scanBehind = so->oppoDirCheck = false;	/* reset */
 
 		/*
 		 * Required scan key wasn't satisfied, so required arrays will have to
@@ -2293,19 +2292,27 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	if (so->scanBehind && has_required_opposite_direction_only)
 	{
 		/*
-		 * However, we avoid this behavior whenever the scan involves a scan
+		 * However, we do things differently whenever the scan involves a scan
 		 * key required in the opposite direction to the scan only, along with
 		 * a finaltup with at least one truncated attribute that's associated
 		 * with a scan key marked required (required in either direction).
 		 *
 		 * _bt_check_compare simply won't stop the scan for a scan key that's
 		 * marked required in the opposite scan direction only.  That leaves
-		 * us without any reliable way of reconsidering any opposite-direction
+		 * us without an automatic way of reconsidering any opposite-direction
 		 * inequalities if it turns out that starting a new primitive index
 		 * scan will allow _bt_first to skip ahead by a great many leaf pages
 		 * (see next section for details of how that works).
+		 *
+		 * We deal with this by explicitly scheduling a finaltup recheck for
+		 * the next page -- we'll call _bt_oppodir_checkkeys for the next
+		 * page's finaltup instead.  You can think of this as a way of dealing
+		 * with this page's finaltup being truncated by checking the next
+		 * page's finaltup instead.  And you can think of the oppoDirCheck
+		 * recheck handling within _bt_readpage as complementing the similar
+		 * scanBehind recheck made from within _bt_checkkeys.
 		 */
-		goto new_prim_scan;
+		so->oppoDirCheck = true;	/* schedule next page's finaltup recheck */
 	}
 
 	/*
@@ -2343,54 +2350,16 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * also before the _bt_first-wise start of tuples for our new qual.  That
 	 * at least suggests many more skippable pages beyond the current page.
 	 */
-	if (has_required_opposite_direction_only && pstate->finaltup &&
-		(all_required_satisfied || oppodir_inequality_sktrig))
+	else if (has_required_opposite_direction_only && pstate->finaltup &&
+			 (all_required_satisfied || oppodir_inequality_sktrig) &&
+			 unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
 	{
-		int			nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
-		ScanDirection flipped;
-		bool		continuescanflip;
-		int			opsktrig;
-
 		/*
-		 * We're checking finaltup (which is usually not caller's tuple), so
-		 * cannot reuse work from caller's earlier _bt_check_compare call.
-		 *
-		 * Flip the scan direction when calling _bt_check_compare this time,
-		 * so that it will set continuescanflip=false when it encounters an
-		 * inequality required in the opposite scan direction.
+		 * Make sure that any non-required arrays are set to the first array
+		 * element for the current scan direction
 		 */
-		Assert(!so->scanBehind);
-		opsktrig = 0;
-		flipped = -dir;
-		_bt_check_compare(scan, flipped,
-						  pstate->finaltup, nfinaltupatts, tupdesc,
-						  false, false, false,
-						  &continuescanflip, &opsktrig);
-
-		/*
-		 * Only start a new primitive index scan when finaltup has a required
-		 * unsatisfied inequality (unsatisfied in the opposite direction)
-		 */
-		Assert(all_required_satisfied != oppodir_inequality_sktrig);
-		if (unlikely(!continuescanflip &&
-					 so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber))
-		{
-			/*
-			 * It's possible for the same inequality to be unsatisfied by both
-			 * caller's tuple (in scan's direction) and finaltup (in the
-			 * opposite direction) due to _bt_check_compare's behavior with
-			 * NULLs
-			 */
-			Assert(opsktrig >= sktrig); /* not opsktrig > sktrig due to NULLs */
-
-			/*
-			 * Make sure that any non-required arrays are set to the first
-			 * array element for the current scan direction
-			 */
-			_bt_rewind_nonrequired_arrays(scan, dir);
-
-			goto new_prim_scan;
-		}
+		_bt_rewind_nonrequired_arrays(scan, dir);
+		goto new_prim_scan;
 	}
 
 	/*
@@ -3522,7 +3491,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 		 *
 		 * Assert that the scan isn't in danger of becoming confused.
 		 */
-		Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
+		Assert(!so->scanBehind && !so->oppoDirCheck);
+		Assert(!pstate->prechecked && !pstate->firstmatch);
 		Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
 											 tupnatts, false, 0, NULL));
 	}
@@ -3634,6 +3604,49 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 								  ikey, true);
 }
 
+/*
+ * Test whether an indextuple satisfies inequalities required in the opposite
+ * direction only (and lower-order equalities required in either direction).
+ *
+ * scan: index scan descriptor (containing a search-type scankey)
+ * dir: current scan direction (flipped by us to get opposite direction)
+ * finaltup: final index tuple on the page
+ *
+ * Caller's finaltup tuple is the page high key (for forwards scans), or the
+ * first non-pivot tuple (for backwards scans).  Caller during scans with
+ * required array keys.
+ *
+ * Return true if finatup satisfies keys, false if not.  If the tuple fails to
+ * pass the qual, then caller is should start another primitive index scan;
+ * _bt_first can efficiently relocate the scan to a far later leaf page.
+ *
+ * Note: we focus on required-in-opposite-direction scan keys (e.g. for a
+ * required > or >= key, assuming a forwards scan) because _bt_checkkeys() can
+ * always deal with required-in-current-direction scan keys on its own.
+ */
+bool
+_bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+					  IndexTuple finaltup)
+{
+	Relation	rel = scan->indexRelation;
+	TupleDesc	tupdesc = RelationGetDescr(rel);
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	int			nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
+	bool		continuescan;
+	ScanDirection flipped = -dir;
+	int			ikey = 0;
+
+	Assert(so->numArrayKeys);
+
+	_bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc,
+					  false, false, false, &continuescan, &ikey);
+
+	if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
+		return false;
+
+	return true;
+}
+
 /*
  * Test whether an indextuple satisfies current scan condition.
  *
-- 
2.45.2