v9-0002-Normalize-nbtree-truncated-high-key-array-behavio.patch

application/octet-stream

Filename: v9-0002-Normalize-nbtree-truncated-high-key-array-behavio.patch
Type: application/octet-stream
Part: 2
Message: Re: Adding skip scan (including MDAM style range skip scan) to nbtree

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v9-0002
Subject: Normalize nbtree truncated high key array behavior.
File+
src/backend/access/nbtree/nbtree.c 4 0
src/backend/access/nbtree/nbtsearch.c 22 0
src/backend/access/nbtree/nbtutils.c 66 53
src/include/access/nbtree.h 3 0
From e493af4514512a5d04b344d13aac52494facb87a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Thu, 8 Aug 2024 13:51:18 -0400
Subject: [PATCH v9 2/3] Normalize nbtree truncated high key array behavior.

Commit 5bf748b8 taught nbtree ScalarArrayOp array processing to decide
when and how to start the next primitive index scan based on physical
index characteristics.  This included rules for deciding whether to
start a new primitive index scan (or whether to move onto the right
sibling leaf page instead) whenever the scan encounters a leaf high key
with truncated lower-order columns whose omitted/-inf values are covered
by one or more arrays.

Prior to this commit, nbtree only treated a truncated key column as
satisfying scan keys that were marked required in the scan's direction.
It would just give up and start a new primitive index scan in cases
involving inequalities marked required in the opposite direction only
(in practice this meant > and >= strategy scan keys, since only forward
scans consider the page high key like this).

Bring > and >= strategy scan keys in line with other required scan key
types: have nbtree persist with its current primitive index scan
regardless of the operator strategy in use.  This requires scheduling
and then performing an explicit check of the next page's high key (if
any) at the point that _bt_readpage is next called.

Although this could be considered a stand alone piece of work, it was
written in preparation for an upcoming patch to add skip scan to nbtree.
Without this enhancement, skip scan would create cases where the scan's
"skip arrays" trigger an excessive number of primitive index scans.  In
principle the underlying problem exists independently of whether skip
arrays or conventional SAOP arrays are used.  In practice skip arrays
tend to make array advancement a lot more sensitive to issues in this
area, so this work is more or less a prerequisite for skip scan.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAH2-Wz=9A_UtM7HzUThSkQ+BcrQsQZuNhWOvQWK06PRkEp=SKQ@mail.gmail.com
---
 src/include/access/nbtree.h           |   3 +
 src/backend/access/nbtree/nbtree.c    |   4 +
 src/backend/access/nbtree/nbtsearch.c |  22 +++++
 src/backend/access/nbtree/nbtutils.c  | 119 ++++++++++++++------------
 4 files changed, 95 insertions(+), 53 deletions(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d64300fb9..38b600945 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1048,6 +1048,7 @@ typedef struct BTScanOpaqueData
 	int			numArrayKeys;	/* number of equality-type array keys */
 	bool		needPrimScan;	/* New prim scan to continue in current dir? */
 	bool		scanBehind;		/* Last array advancement matched -inf attr? */
+	bool		oppositeDirCheck;	/* check opposite dir scan keys? */
 	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
@@ -1289,6 +1290,8 @@ extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 						  IndexTuple tuple, int tupnatts);
+extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+								  IndexTuple finaltup);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4febe6bdc..ddc6e1f7a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -333,6 +333,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 
 	so->needPrimScan = false;
 	so->scanBehind = false;
+	so->oppositeDirCheck = false;
 	so->arrayKeys = NULL;
 	so->orderProcs = NULL;
 	so->arrayContext = NULL;
@@ -376,6 +377,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	so->markItemIndex = -1;
 	so->needPrimScan = false;
 	so->scanBehind = false;
+	so->oppositeDirCheck = false;
 	BTScanPosUnpinIfPinned(so->markPos);
 	BTScanPosInvalidate(so->markPos);
 
@@ -621,6 +623,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 		 */
 		so->needPrimScan = false;
 		so->scanBehind = false;
+		so->oppositeDirCheck = false;
 	}
 	else
 	{
@@ -679,6 +682,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 			 */
 			so->needPrimScan = true;
 			so->scanBehind = false;
+			so->oppositeDirCheck = false;
 		}
 		else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
 		{
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index b11112539..d49e0fb70 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1704,6 +1704,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 			ItemId		iid = PageGetItemId(page, P_HIKEY);
 
 			pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+
+			if (unlikely(so->oppositeDirCheck))
+			{
+				/*
+				 * Last _bt_readpage call scheduled precheck of finaltup for
+				 * required scan keys up to and including a > or >= scan key
+				 * (necessary because > and >= are only generally considered
+				 * required when scanning backwards)
+				 */
+				Assert(so->scanBehind);
+				so->oppositeDirCheck = false;
+				if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
+				{
+					/*
+					 * Back out of continuing with this leaf page -- schedule
+					 * another primitive index scan after all
+					 */
+					so->currPos.moreRight = false;
+					so->needPrimScan = true;
+					return false;
+				}
+			}
 		}
 
 		/* load items[] in ascending order */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index b4ba51357..75608034d 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1371,7 +1371,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
 			curArrayKey->cur_elem = 0;
 		skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
 	}
-	so->scanBehind = false;
+	so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 }
 
 /*
@@ -1680,8 +1680,7 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
 
 	Assert(so->numArrayKeys);
 
-	/* scanBehind flag doesn't persist across primitive index scans - reset */
-	so->scanBehind = false;
+	so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 
 	/*
 	 * Array keys are advanced within _bt_checkkeys when the scan reaches the
@@ -1817,7 +1816,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 		Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
 											 tupnatts, false, 0, NULL));
 
-		so->scanBehind = false; /* reset */
+		so->scanBehind = so->oppositeDirCheck = false;	/* reset */
 
 		/*
 		 * Required scan key wasn't satisfied, so required arrays will have to
@@ -2302,19 +2301,27 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	if (so->scanBehind && has_required_opposite_direction_only)
 	{
 		/*
-		 * However, we avoid this behavior whenever the scan involves a scan
+		 * However, we do things differently whenever the scan involves a scan
 		 * key required in the opposite direction to the scan only, along with
 		 * a finaltup with at least one truncated attribute that's associated
 		 * with a scan key marked required (required in either direction).
 		 *
 		 * _bt_check_compare simply won't stop the scan for a scan key that's
 		 * marked required in the opposite scan direction only.  That leaves
-		 * us without any reliable way of reconsidering any opposite-direction
+		 * us without an automatic way of reconsidering any opposite-direction
 		 * inequalities if it turns out that starting a new primitive index
 		 * scan will allow _bt_first to skip ahead by a great many leaf pages
 		 * (see next section for details of how that works).
+		 *
+		 * We deal with this by explicitly scheduling a finaltup recheck for
+		 * the next page -- we'll call _bt_oppodir_checkkeys for the next
+		 * page's finaltup instead.  You can think of this as a way of dealing
+		 * with this page's finaltup being truncated by checking the next
+		 * page's finaltup instead.  And you can think of the oppositeDirCheck
+		 * recheck handling within _bt_readpage as complementing the similar
+		 * scanBehind recheck made from within _bt_checkkeys.
 		 */
-		goto new_prim_scan;
+		so->oppositeDirCheck = true;	/* schedule next page's finaltup recheck */
 	}
 
 	/*
@@ -2352,54 +2359,16 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 * also before the _bt_first-wise start of tuples for our new qual.  That
 	 * at least suggests many more skippable pages beyond the current page.
 	 */
-	if (has_required_opposite_direction_only && pstate->finaltup &&
-		(all_required_satisfied || oppodir_inequality_sktrig))
+	else if (has_required_opposite_direction_only && pstate->finaltup &&
+			 (all_required_satisfied || oppodir_inequality_sktrig) &&
+			 unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
 	{
-		int			nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
-		ScanDirection flipped;
-		bool		continuescanflip;
-		int			opsktrig;
-
 		/*
-		 * We're checking finaltup (which is usually not caller's tuple), so
-		 * cannot reuse work from caller's earlier _bt_check_compare call.
-		 *
-		 * Flip the scan direction when calling _bt_check_compare this time,
-		 * so that it will set continuescanflip=false when it encounters an
-		 * inequality required in the opposite scan direction.
+		 * Make sure that any non-required arrays are set to the first array
+		 * element for the current scan direction
 		 */
-		Assert(!so->scanBehind);
-		opsktrig = 0;
-		flipped = -dir;
-		_bt_check_compare(scan, flipped,
-						  pstate->finaltup, nfinaltupatts, tupdesc,
-						  false, false, false,
-						  &continuescanflip, &opsktrig);
-
-		/*
-		 * Only start a new primitive index scan when finaltup has a required
-		 * unsatisfied inequality (unsatisfied in the opposite direction)
-		 */
-		Assert(all_required_satisfied != oppodir_inequality_sktrig);
-		if (unlikely(!continuescanflip &&
-					 so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber))
-		{
-			/*
-			 * It's possible for the same inequality to be unsatisfied by both
-			 * caller's tuple (in scan's direction) and finaltup (in the
-			 * opposite direction) due to _bt_check_compare's behavior with
-			 * NULLs
-			 */
-			Assert(opsktrig >= sktrig); /* not opsktrig > sktrig due to NULLs */
-
-			/*
-			 * Make sure that any non-required arrays are set to the first
-			 * array element for the current scan direction
-			 */
-			_bt_rewind_nonrequired_arrays(scan, dir);
-
-			goto new_prim_scan;
-		}
+		_bt_rewind_nonrequired_arrays(scan, dir);
+		goto new_prim_scan;
 	}
 
 	/*
@@ -3511,7 +3480,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 		 *
 		 * Assert that the scan isn't in danger of becoming confused.
 		 */
-		Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
+		Assert(!so->scanBehind && !so->oppositeDirCheck);
+		Assert(!pstate->prechecked && !pstate->firstmatch);
 		Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
 											 tupnatts, false, 0, NULL));
 	}
@@ -3623,6 +3593,49 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
 								  ikey, true);
 }
 
+/*
+ * Test whether an indextuple satisfies inequalities required in the opposite
+ * direction only (and lower-order equalities required in either direction).
+ *
+ * scan: index scan descriptor (containing a search-type scankey)
+ * dir: current scan direction (flipped by us to get opposite direction)
+ * finaltup: final index tuple on the page
+ *
+ * Caller's finaltup tuple is the page high key (for forwards scans), or the
+ * first non-pivot tuple (for backwards scans).  Called during scans with
+ * required array keys.
+ *
+ * Return true if finatup satisfies keys, false if not.  If the tuple fails to
+ * pass the qual, then caller should start another primitive index scan;
+ * _bt_first can efficiently relocate the scan to a far later leaf page.
+ *
+ * Note: we focus on required-in-opposite-direction scan keys (e.g. for a
+ * required > or >= key, assuming a forwards scan) because _bt_checkkeys() can
+ * always deal with required-in-current-direction scan keys on its own.
+ */
+bool
+_bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+					  IndexTuple finaltup)
+{
+	Relation	rel = scan->indexRelation;
+	TupleDesc	tupdesc = RelationGetDescr(rel);
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	int			nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
+	bool		continuescan;
+	ScanDirection flipped = -dir;
+	int			ikey = 0;
+
+	Assert(so->numArrayKeys);
+
+	_bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc,
+					  false, false, false, &continuescan, &ikey);
+
+	if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
+		return false;
+
+	return true;
+}
+
 /*
  * Test whether an indextuple satisfies current scan condition.
  *
-- 
2.45.2