v24-0002-Improve-nbtree-SAOP-primitive-scan-scheduling.patch
application/x-patch
Filename: v24-0002-Improve-nbtree-SAOP-primitive-scan-scheduling.patch
Type: application/x-patch
Part: 4
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v24-0002
Subject: Improve nbtree SAOP primitive scan scheduling.
| File | + | − |
|---|---|---|
| src/backend/access/nbtree/nbtsearch.c | 27 | 25 |
| src/backend/access/nbtree/nbtutils.c | 104 | 61 |
| src/include/access/nbtree.h | 5 | 4 |
From 427e9e40bacbb3eb5b5c6c2e834395395802fe1d Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Fri, 14 Feb 2025 16:11:59 -0500
Subject: [PATCH v24 2/6] Improve nbtree SAOP primitive scan scheduling.
Add new primitive index scan scheduling heuristics that make
_bt_advance_array_keys avoid ending the ongoing primitive index scan
when it has already read more than one leaf page during the ongoing
primscan. This tends to result in better decisions about how to start
and end primitive index scans with queries that have SAOP arrays with
many elements that are clustered together (e.g., contiguous integers).
Preparation for an upcoming patch that will add skip scan optimizations
to nbtree. That'll work by adding skip arrays, which behave similarly
to SAOP arrays with many elements clustered together.
Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wzkz0wPe6+02kr+hC+JJNKfGtjGTzpG3CFVTQmKwWNrXNw@mail.gmail.com
---
src/include/access/nbtree.h | 9 +-
src/backend/access/nbtree/nbtsearch.c | 52 ++++----
src/backend/access/nbtree/nbtutils.c | 165 ++++++++++++++++----------
3 files changed, 136 insertions(+), 90 deletions(-)
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 000c7289b..599e6fa4c 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1043,8 +1043,8 @@ typedef struct BTScanOpaqueData
/* workspace for SK_SEARCHARRAY support */
int numArrayKeys; /* number of equality-type array keys */
bool needPrimScan; /* New prim scan to continue in current dir? */
- bool scanBehind; /* Last array advancement matched -inf attr? */
- bool oppositeDirCheck; /* explicit scanBehind recheck needed? */
+ bool scanBehind; /* arrays might be ahead of scan's key space? */
+ bool oppositeDirCheck; /* scanBehind opposite-scan-dir check? */
BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */
FmgrInfo *orderProcs; /* ORDER procs for required equality keys */
MemoryContext arrayContext; /* scan-lifespan context for array data */
@@ -1099,6 +1099,7 @@ typedef struct BTReadPageState
* Input and output parameters, set and unset by both _bt_readpage and
* _bt_checkkeys to manage precheck optimizations
*/
+ bool firstpage; /* on first page of current primitive scan? */
bool prechecked; /* precheck set continuescan to 'true'? */
bool firstmatch; /* at least one match so far? */
@@ -1298,8 +1299,8 @@ extern int _bt_binsrch_array_skey(FmgrInfo *orderproc,
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
IndexTuple tuple, int tupnatts);
-extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
- IndexTuple finaltup);
+extern bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple finaltup);
extern void _bt_killitems(IndexScanDesc scan);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 941b4eaaf..12d3145b4 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1558,6 +1558,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
pstate.offnum = InvalidOffsetNumber;
pstate.skip = InvalidOffsetNumber;
pstate.continuescan = true; /* default assumption */
+ pstate.firstpage = firstPage;
pstate.prechecked = false;
pstate.firstmatch = false;
pstate.rechecks = 0;
@@ -1603,7 +1604,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* required < or <= strategy scan keys) during the precheck, we can safely
* assume that this must also be true of all earlier tuples from the page.
*/
- if (!firstPage && !so->scanBehind && minoff < maxoff)
+ if (!pstate.firstpage && !so->scanBehind && minoff < maxoff)
{
ItemId iid;
IndexTuple itup;
@@ -1620,36 +1621,24 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
if (ScanDirectionIsForward(dir))
{
/* SK_SEARCHARRAY forward scans must provide high key up front */
- if (arrayKeys && !P_RIGHTMOST(opaque))
+ if (arrayKeys)
{
- ItemId iid = PageGetItemId(page, P_HIKEY);
-
- pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
-
- if (unlikely(so->oppositeDirCheck))
+ if (!P_RIGHTMOST(opaque))
{
- Assert(so->scanBehind);
+ ItemId iid = PageGetItemId(page, P_HIKEY);
- /*
- * Last _bt_readpage call scheduled a recheck of finaltup for
- * required scan keys up to and including a > or >= scan key.
- *
- * _bt_checkkeys won't consider the scanBehind flag unless the
- * scan is stopped by a scan key required in the current scan
- * direction. We need this recheck so that we'll notice when
- * all tuples on this page are still before the _bt_first-wise
- * start of matches for the current set of array keys.
- */
- if (!_bt_oppodir_checkkeys(scan, dir, pstate.finaltup))
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+
+ if (unlikely(so->scanBehind) &&
+ !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
{
/* Schedule another primitive index scan after all */
so->currPos.moreRight = false;
so->needPrimScan = true;
return false;
}
-
- /* Deliberately don't unset scanBehind flag just yet */
}
+ so->scanBehind = so->oppositeDirCheck = false; /* reset */
}
/* load items[] in ascending order */
@@ -1745,7 +1734,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
* only appear on non-pivot tuples on the right sibling page are
* common.
*/
- if (pstate.continuescan && !P_RIGHTMOST(opaque))
+ if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
{
ItemId iid = PageGetItemId(page, P_HIKEY);
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
@@ -1767,11 +1756,24 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
else
{
/* SK_SEARCHARRAY backward scans must provide final tuple up front */
- if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+ if (arrayKeys)
{
- ItemId iid = PageGetItemId(page, minoff);
+ if (minoff <= maxoff && !P_LEFTMOST(opaque))
+ {
+ ItemId iid = PageGetItemId(page, minoff);
- pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+ pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+
+ if (unlikely(so->scanBehind) &&
+ !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
+ {
+ /* Schedule another primitive index scan after all */
+ so->currPos.moreLeft = false;
+ so->needPrimScan = true;
+ return false;
+ }
+ }
+ so->scanBehind = so->oppositeDirCheck = false; /* reset */
}
/* load items[] in descending order */
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 693e43c67..f7e5cd02a 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -42,6 +42,8 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
#endif
+static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple finaltup);
static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
bool advancenonrequired, bool prechecked, bool firstmatch,
@@ -874,11 +876,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
all_required_satisfied = true,
all_satisfied = true;
- /*
- * Unset so->scanBehind (and so->oppositeDirCheck) in case they're still
- * set from back when we dealt with the previous page's high key/finaltup
- */
- so->scanBehind = so->oppositeDirCheck = false;
+ Assert(!so->scanBehind && !so->oppositeDirCheck);
if (sktrig_required)
{
@@ -1387,6 +1385,12 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
* for next page's finaltup (and we skip it for this page's finaltup).
*/
so->oppositeDirCheck = true; /* recheck next page's high key */
+
+ /*
+ * Make sure that any non-required arrays are set to the first array
+ * element for the current scan direction
+ */
+ _bt_rewind_nonrequired_arrays(scan, dir);
}
/*
@@ -1429,14 +1433,12 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
(all_required_satisfied || oppodir_inequality_sktrig) &&
unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
{
- /*
- * Make sure that any non-required arrays are set to the first array
- * element for the current scan direction
- */
_bt_rewind_nonrequired_arrays(scan, dir);
goto new_prim_scan;
}
+continue_scan:
+
/*
* Stick with the ongoing primitive index scan for now.
*
@@ -1458,8 +1460,10 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
if (so->scanBehind)
{
/* Optimization: skip by setting "look ahead" mechanism's offnum */
- Assert(ScanDirectionIsForward(dir));
- pstate->skip = pstate->maxoff + 1;
+ if (ScanDirectionIsForward(dir))
+ pstate->skip = pstate->maxoff + 1;
+ else
+ pstate->skip = pstate->minoff - 1;
}
/* Caller's tuple doesn't match the new qual */
@@ -1469,6 +1473,37 @@ new_prim_scan:
Assert(pstate->finaltup); /* not on rightmost/leftmost page */
+ /*
+ * Looks like another primitive index scan is required. But consider
+ * backing out and continuing the primscan based on scan-level heuristics.
+ *
+ * Continue the ongoing primitive scan (but schedule a recheck for when
+ * the scan arrives on the next sibling leaf page) when it has already
+ * read at least one leaf page before the one we're reading now. This is
+ * important when reading subsets of an index with many distinct values in
+ * respect of an attribute constrained by an array. It encourages fewer,
+ * larger primitive scans where that makes sense.
+ *
+ * Note: This heuristic isn't as aggressive as you might think. We're
+ * conservative about allowing a primitive scan to step from the first
+ * leaf page it reads to the page's sibling page (we only allow it on
+ * first pages whose finaltup strongly suggests that it'll work out).
+ * Clearing this first page finaltup hurdle is a strong signal in itself.
+ */
+ if (!pstate->firstpage)
+ {
+ /* Schedule a recheck once on the next (or previous) page */
+ so->scanBehind = true;
+ if (has_required_opposite_direction_only)
+ so->oppositeDirCheck = true;
+
+ /* Defensively reset any nonrequired SAOP arrays */
+ _bt_rewind_nonrequired_arrays(scan, dir);
+
+ /* Continue the current primitive scan after all */
+ goto continue_scan;
+ }
+
/*
* End this primitive index scan, but schedule another.
*
@@ -1634,6 +1669,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
bool res;
Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
+ Assert(!so->scanBehind && !so->oppositeDirCheck);
res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
arrayKeys, pstate->prechecked, pstate->firstmatch,
@@ -1688,62 +1724,36 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
ikey, NULL))
{
+ /* Override _bt_check_compare, continue primitive scan */
+ pstate->continuescan = true;
+
/*
- * Tuple is still before the start of matches according to the scan's
- * required array keys (according to _all_ of its required equality
- * strategy keys, actually).
+ * We will end up here repeatedly given a group of tuples > the
+ * previous array keys and < the now-current keys (for a backwards
+ * scan it's just the same, though the operators swap positions).
*
- * _bt_advance_array_keys occasionally sets so->scanBehind to signal
- * that the scan's current position/tuples might be significantly
- * behind (multiple pages behind) its current array keys. When this
- * happens, we need to be prepared to recover by starting a new
- * primitive index scan here, on our own.
+ * We must avoid allowing this linear search process to scan very many
+ * tuples from well before the start of tuples matching the current
+ * array keys (or from well before the point where we'll once again
+ * have to advance the scan's array keys).
+ *
+ * We keep the overhead under control by speculatively "looking ahead"
+ * to later still-unscanned items from this same leaf page. We'll only
+ * attempt this once the number of tuples that the linear search
+ * process has examined starts to get out of hand.
*/
- Assert(!so->scanBehind ||
- so->keyData[ikey].sk_strategy == BTEqualStrategyNumber);
- if (unlikely(so->scanBehind) && pstate->finaltup &&
- _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
- BTreeTupleGetNAtts(pstate->finaltup,
- scan->indexRelation),
- false, 0, NULL))
+ pstate->rechecks++;
+ if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
{
- /* Cut our losses -- start a new primitive index scan now */
- pstate->continuescan = false;
- so->needPrimScan = true;
- }
- else
- {
- /* Override _bt_check_compare, continue primitive scan */
- pstate->continuescan = true;
+ /* See if we should skip ahead within the current leaf page */
+ _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
/*
- * We will end up here repeatedly given a group of tuples > the
- * previous array keys and < the now-current keys (for a backwards
- * scan it's just the same, though the operators swap positions).
- *
- * We must avoid allowing this linear search process to scan very
- * many tuples from well before the start of tuples matching the
- * current array keys (or from well before the point where we'll
- * once again have to advance the scan's array keys).
- *
- * We keep the overhead under control by speculatively "looking
- * ahead" to later still-unscanned items from this same leaf page.
- * We'll only attempt this once the number of tuples that the
- * linear search process has examined starts to get out of hand.
+ * Might have set pstate.skip to a later page offset. When that
+ * happens then _bt_readpage caller will inexpensively skip ahead
+ * to a later tuple from the same page (the one just after the
+ * tuple we successfully "looked ahead" to).
*/
- pstate->rechecks++;
- if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
- {
- /* See if we should skip ahead within the current leaf page */
- _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
-
- /*
- * Might have set pstate.skip to a later page offset. When
- * that happens then _bt_readpage caller will inexpensively
- * skip ahead to a later tuple from the same page (the one
- * just after the tuple we successfully "looked ahead" to).
- */
- }
}
/* This indextuple doesn't match the current qual, in any case */
@@ -1760,6 +1770,39 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
ikey, true);
}
+/*
+ * Test whether finaltup (the final tuple on the page) is still before the
+ * start of matches for the current array keys.
+ *
+ * Caller's finaltup tuple is the page high key (for forwards scans), or the
+ * first non-pivot tuple (for backwards scans). Called during scans with
+ * array keys when the so->scanBehind flag was set on the previous page.
+ *
+ * Returns false if the tuple is still before the start of matches. When that
+ * happens caller should cut its losses and start a new primitive index scan.
+ * Otherwise returns true.
+ */
+bool
+_bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir,
+ IndexTuple finaltup)
+{
+ Relation rel = scan->indexRelation;
+ TupleDesc tupdesc = RelationGetDescr(rel);
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
+
+ Assert(so->numArrayKeys);
+
+ if (_bt_tuple_before_array_skeys(scan, dir, finaltup, tupdesc,
+ nfinaltupatts, false, 0, NULL))
+ return false;
+
+ if (!so->oppositeDirCheck)
+ return true;
+
+ return _bt_oppodir_checkkeys(scan, dir, finaltup);
+}
+
/*
* Test whether an indextuple fails to satisfy an inequality required in the
* opposite direction only.
@@ -1778,7 +1821,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
* _bt_checkkeys to stop the scan to consider array advancement/starting a new
* primitive index scan.
*/
-bool
+static bool
_bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
IndexTuple finaltup)
{
--
2.47.2