From 3ad6aa3dbdf79847da7c78367b88d81bbe87cf39 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 16 Nov 2024 15:58:41 -0500 Subject: [PATCH v17 3/3] Add "skipskip" nbtree skip scan optimization. Fix regressions in cases that are nominally eligible to use skip scan but can never actually benefit from skipping. These are cases where the leading skipped prefix column contains many distinct values -- often as many distinct values are there are total index tuples. For example, the test case posted here is fixed by the work from this commit: https://postgr.es/m/51d00219180323d121572e1f83ccde2a@oss.nttdata.com Note that this commit doesn't actually change anything about when or how skip scan decides when or how to skip. It just avoids wasting CPU cycles on uselessly maintaining a skip array at the tuple granularity, preferring to maintain the skip arrays at something closer to the page granularity when that makes sense. See: https://www.postgresql.org/message-id/flat/CAH2-Wz%3DE7XrkvscBN0U6V81NK3Q-dQOmivvbEsjG-zwEfDdFpg%40mail.gmail.com#7d34e8aa875d7a718043834c5ef4c167 Doing well on cases like this is important because we can't expect the optimizer to never choose an affected plan -- we prefer to solve these problems in the executor, which has access to the most reliable and current information about the index. The optimizer can afford to be very optimistic about skipping if actual runtime scan behavior is very similar to a traditional full index scan in the worst case. See "optimizer" section from the original intro mail for more information: https://postgr.es/m/CAH2-Wzmn1YsLzOGgjAQZdn1STSG_y8qP__vggTaPAYXJP%2BG4bw%40mail.gmail.com Author: Peter Geoghegan Reviewed-By: Masahiro Ikeda Discussion: https://postgr.es/m/CAH2-Wz=Y93jf5WjoOsN=xvqpMjRy-bxCE037bVFi-EasrpeUJA@mail.gmail.com --- src/include/access/nbtree.h | 7 + src/backend/access/nbtree/nbtsearch.c | 20 +++ src/backend/access/nbtree/nbtutils.c | 191 +++++++++++++++++++++++--- 3 files changed, 197 insertions(+), 21 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index d841e85bc..c84dcc983 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1111,6 +1111,13 @@ typedef struct BTReadPageState bool prechecked; /* precheck set continuescan to 'true'? */ bool firstmatch; /* at least one match so far? */ + /* + * Input and output parameters, set and unset by both _bt_readpage and + * _bt_checkkeys to manage "skipskip" optimization during skip scans + */ + bool skipskip; + bool noskipskip; + /* * Private _bt_checkkeys state used to manage "look ahead" optimization * (only used during scans with array keys) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 43e321896..468bfb722 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1644,6 +1644,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, pstate.continuescan = true; /* default assumption */ pstate.prechecked = false; pstate.firstmatch = false; + pstate.skipskip = false; + pstate.noskipskip = false; pstate.rechecks = 0; pstate.targetdistance = 0; @@ -1686,6 +1688,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * if the final tuple is == those same keys (and also satisfies any * required < or <= strategy scan keys) during the precheck, we can safely * assume that this must also be true of all earlier tuples from the page. + * + * XXX Maybe we should opt out of this optimization during any skip scan? + * Arguably, it is superseded by the "skipskip" skip scan optimization. + * (In any case this isn't very effective during most skip scans.) */ if (!firstPage && !so->scanBehind && minoff < maxoff) { @@ -1837,6 +1843,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, truncatt = BTreeTupleGetNAtts(itup, rel); pstate.prechecked = false; /* precheck didn't cover HIKEY */ + if (pstate.skipskip) + { + Assert(itup == pstate.finaltup); + + _bt_start_array_keys(scan, dir); + pstate.skipskip = false; /* reset for finaltup */ + } _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } @@ -1897,6 +1910,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, Assert(!BTreeTupleIsPivot(itup)); pstate.offnum = offnum; + if (offnum == minoff && pstate.skipskip) + { + Assert(itup == pstate.finaltup); + + _bt_start_array_keys(scan, dir); + pstate.skipskip = false; /* reset for finaltup */ + } passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, itup, indnatts); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 9de7c40e8..0a3a3bf32 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -151,11 +151,14 @@ static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); static void _bt_mark_scankey_required(ScanKey skey); static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, + bool advancenonrequired, bool skipskip, + bool prechecked, bool firstmatch, bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan); +static bool _bt_checkkeys_is_skipskip_safe(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, TupleDesc tupdesc); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, @@ -3135,6 +3138,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, ScanDirection dir = so->currPos.dir; int arrayidx = 0; bool beyond_end_advance = false, + has_skip_array = false, has_required_opposite_direction_only = false, oppodir_inequality_sktrig = false, all_required_satisfied = true, @@ -3189,6 +3193,8 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, { array = &so->arrayKeys[arrayidx++]; Assert(array->scan_key == ikey); + if (array->num_elems == -1) + has_skip_array = true; } } else @@ -3211,8 +3217,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) { - Assert(sktrig_required); - required = true; if (cur->sk_attno > tupnatts) @@ -3359,7 +3363,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } else { - Assert(sktrig_required && required); + Assert(required); /* * This is a required non-array equality strategy scan key, which @@ -3401,7 +3405,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * be eliminated by _bt_preprocess_keys. It won't matter if some of * our "true" array scan keys (or even all of them) are non-required. */ - if (required && + if (sktrig_required && required && ((ScanDirectionIsForward(dir) && result > 0) || (ScanDirectionIsBackward(dir) && result < 0))) beyond_end_advance = true; @@ -3416,7 +3420,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * array scan keys are considered interesting.) */ all_satisfied = false; - if (required) + if (sktrig_required && required) all_required_satisfied = false; else { @@ -3528,7 +3532,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, /* Recheck _bt_check_compare on behalf of caller */ if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - false, false, false, + false, !sktrig_required, false, false, &continuescan, &nsktrig) && !so->scanBehind) { @@ -3597,6 +3601,9 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, return false; } + /* _bt_readpage must have unset skipskip flag (for finaltup call) */ + Assert(!pstate->skipskip); + /* * Postcondition array state assertion (for still-unsatisfied tuples). * @@ -3766,6 +3773,25 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, pstate->skip = pstate->maxoff + 1; } + /* + * Optimization: if a scan with a skip array doesn't satisfy every + * required key (in practice this is almost always all the scan's keys), + * we assume that this page isn't likely to skip "within" a page using + * _bt_checkkeys_look_ahead. We'll apply the 'skipskip' optimization. + * + * The 'skipskip' optimization allows _bt_checkkeys/_bt_check_compare to + * stop maintaining the scan's skip arrays until we've reached finaltup. + */ + else if (has_skip_array && !pstate->noskipskip && + !all_required_satisfied && tuple != pstate->finaltup) + { + /* Don't consider 'skipskip' optimization more than once per page */ + if (_bt_checkkeys_is_skipskip_safe(scan, pstate, tuple, tupdesc)) + pstate->skipskip = true; + else + pstate->noskipskip = true; + } + /* Caller's tuple doesn't match the new qual */ return false; @@ -3867,7 +3893,8 @@ end_toplevel_scan: * make the scan much more efficient: if there are few distinct values in "x", * we'll be able to skip over many irrelevant leaf pages. (If on the other * hand there are many distinct values in "x" then the scan will degenerate - * into a full index scan at run time.) + * into a full index scan at run time, but we'll be no worse off overall. + * _bt_checkkeys's 'skipskip' optimization keeps the runtime overhead low.) * * If possible, redundant keys are eliminated: we keep only the tightest * >/>= bound and the tightest indexRelation) == tupnatts); res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - arrayKeys, pstate->prechecked, pstate->firstmatch, + arrayKeys, pstate->skipskip, + pstate->prechecked, pstate->firstmatch, &pstate->continuescan, &ikey); #ifdef USE_ASSERT_CHECKING @@ -4920,7 +4948,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, * Assert that the scan isn't in danger of becoming confused. */ Assert(!so->scanBehind && !so->oppositeDirCheck); - Assert(!pstate->prechecked && !pstate->firstmatch); + Assert(!pstate->skipskip && !pstate->prechecked && + !pstate->firstmatch); Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, false, 0, NULL)); } @@ -4934,7 +4963,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, * get the same answer without those optimizations */ Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - false, false, false, + false, pstate->skipskip, false, false, &dcontinuescan, &dikey)); Assert(pstate->continuescan == dcontinuescan); } @@ -5065,7 +5094,7 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, Assert(so->numArrayKeys); _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc, - false, false, false, &continuescan, &ikey); + false, false, false, false, &continuescan, &ikey); if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber) return false; @@ -5104,17 +5133,24 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, * * Though we advance non-required array keys on our own, that shouldn't have * any lasting consequences for the scan. By definition, non-required arrays - * have no fixed relationship with the scan's progress. (There are delicate - * considerations for non-required arrays when the arrays need to be advanced - * following our setting continuescan to false, but that doesn't concern us.) + * have no fixed relationship with the scan's progress. (skipskip=true makes + * us treat required scan keys as non-required, though. That's safe because + * _bt_readpage will account for our failure to fully maintain the scan's + * arrays later on, right before its finaltup call to _bt_checkkeys.) * * Pass advancenonrequired=false to avoid all array related side effects. * This allows _bt_advance_array_keys caller to avoid infinite recursion. + * + * Pass skipskip=true to instruct us to skip all of the scan's skip arrays. + * This provides the scan with a way of keeping the cost of maintaining its + * skip arrays under control, given skip arrays on high cardinality columns + * (i.e. given a skip array on a column which isn't a good fit for skip scan). */ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, + bool advancenonrequired, bool skipskip, + bool prechecked, bool firstmatch, bool *continuescan, int *ikey) { BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -5131,10 +5167,18 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, /* * Check if the key is required in the current scan direction, in the - * opposite scan direction _only_, or in neither direction + * opposite scan direction _only_, or in neither direction (except + * when reading a page that's now using the "skipskip" optimization) */ - if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) + if (skipskip) + { + Assert(!prechecked); + + if (key->sk_flags & SK_BT_SKIP) + continue; + } + else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || + ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) requiredSameDir = true; else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) @@ -5247,7 +5291,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * forward scans.) */ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsBackward(dir)) + !skipskip && ScanDirectionIsBackward(dir)) *continuescan = false; } else @@ -5265,7 +5309,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * backward scans.) */ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsForward(dir)) + !skipskip && ScanDirectionIsForward(dir)) *continuescan = false; } @@ -5499,6 +5543,100 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, return result; } +/* + * Can _bt_checkkeys/_bt_check_compare apply the 'skipskip' optimization? + * + * Called when _bt_advance_array_keys finds that no required scan key is + * satisfied during a scan with at least one skip array. + * + * Return value indicates if the optimization is safe for the tuples on the + * page after caller's tuple, but before its page's finaltup. + */ +static bool +_bt_checkkeys_is_skipskip_safe(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, TupleDesc tupdesc) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = so->currPos.dir; + IndexTuple finaltup = pstate->finaltup; + int arrayidx = 0, + nfinaltupatts; + bool rangearrayseen = false; + + Assert(!BTreeTupleIsPivot(tuple)); + + /* + * Don't attempt the optimization when reading the rightmost leaf page + * (unless scanning backwards, when it's the leftmost leaf page instead) + */ + if (!finaltup) + return false; + + nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel); + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array = NULL; + Datum tupdatum; + bool tupnull; + int32 result; + + /* + * Only need to check range skip arrays within this loop. + * + * A SAOP array can always be treated as a non-required array within + * _bt_check_compare. A skip array without a lower or upper bound is + * always safe to skip within _bt_check_compare, since it is satisfied + * by every possible value. + */ + if (cur->sk_strategy != BTEqualStrategyNumber) + continue; + if (!(cur->sk_flags & SK_SEARCHARRAY)) + continue; + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == ikey); + if (array->num_elems != -1 || array->null_elem) + continue; + + /* + * Found a range skip array to test. + * + * Scans with more than one range skip array are not eligible to use + * the optimization. Note that we support the skipskip optimization + * for a qual like "WHERE a BETWEEN 1 AND 10 AND b BETWEEN 1 AND 3", + * since there the qual actually requires only a single skip array. + * However, if such a qual ended with "... AND C > 42", then it will + * prevent use of the skipskip optimization. (XXX we could probably be + * smarter than this...but is it worth it?) + */ + if (rangearrayseen) + return false; + rangearrayseen = true; + + /* test the tuple that just advanced arrays within our caller */ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_BT_REQFWD); + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], false, dir, + tupdatum, tupnull, array, cur, &result); + if (result != 0) + return false; + + /* test the page's finaltup iff relevant attribute isn't truncated */ + if (cur->sk_attno > nfinaltupatts) + continue; + + tupdatum = index_getattr(finaltup, cur->sk_attno, tupdesc, &tupnull); + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], false, dir, + tupdatum, tupnull, array, cur, &result); + if (result != 0) + return false; + } + + return true; +} + /* * Determine if a scan with array keys should skip over uninteresting tuples. * @@ -5524,6 +5662,17 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, OffsetNumber aheadoffnum; IndexTuple ahead; + /* + * Use of the "look ahead" skipping mechanism is mutually exclusive with + * use of skip scan's similar "skipskip" mechanism + * + * XXX While it's true that we can only use one of the optimization at a + * time, there's still nothing stopping us from trying this one out first. + * Maybe _bt_checkkeys_is_skipskip_safe should be taught to apply a test + * against pstate->rechecks, so as to give this optimization a chance? + */ + Assert(!pstate->skipskip); + /* Avoid looking ahead when comparing the page high key */ if (pstate->offnum < pstate->minoff) return; -- 2.45.2