From a681977fc52ef25d58593ebee08d97d3f1ca84bb Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 16 Nov 2024 15:58:41 -0500 Subject: [PATCH v20 3/3] Lower the overhead of nbtree runtime skip checks. Add a new "skipskip" strategy to fix regressions in index scans that are nominally eligible to use skip scan, but can never actually benefit from skipping. These are cases where a leading prefix column contains many distinct values -- typically as many distinct values are there are total index tuples. This works by dynamically falling back on a strategy that temporarily treats all user-supplied scan keys as if they were marked non-required, while avoiding all skip array maintenance. The new optimization is applied for all pages beyond each primitive index scan's first leaf page read. Note that this commit doesn't actually change anything about when or how skip scan decides to schedule new primitive index scans. It is limited to saving CPU cycles by varying how we read individual index tuples in cases where maintaining skip arrays cannot possibly pay for itself. Fixing (or significantly ameliorating) regressions in the worst case like this enables skip scan's approach within the planner. The planner doesn't generate distinct index paths to represent nbtree index skip scans; whether and to what extent a scan actually skips is always determined dynamically, at runtime. The planner considers skipping when costing relevant index paths, but that in itself won't influence skip scan's runtime behavior. This approach makes skip scan adapt to skewed key distributions. It also makes planning less sensitive to misestimations. An index path with an inaccurate estimate of the total number of required primitive index scans won't have to perform many more primitive scans at runtime (it'll behave like a traditional full index scan instead). Author: Peter Geoghegan Reviewed-By: Masahiro Ikeda Discussion: https://postgr.es/m/CAH2-Wz=Y93jf5WjoOsN=xvqpMjRy-bxCE037bVFi-EasrpeUJA@mail.gmail.com --- src/include/access/nbtree.h | 3 + src/backend/access/nbtree/nbtsearch.c | 55 ++++++++ src/backend/access/nbtree/nbtutils.c | 195 +++++++++++++++++++++----- 3 files changed, 216 insertions(+), 37 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9c53ba7e9..e4ee13dd0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1111,6 +1111,7 @@ typedef struct BTReadPageState */ bool prechecked; /* precheck set continuescan to 'true'? */ bool firstmatch; /* at least one match so far? */ + bool skipskip; /* skip maintenance of skip arrays? */ /* * Private _bt_checkkeys state used to manage "look ahead" optimization @@ -1306,6 +1307,8 @@ extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arra IndexTuple tuple, int tupnatts); extern bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup); +extern bool _bt_checkkeys_skipskip(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, TupleDesc tupdesc); extern void _bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d332549cb..792146e7b 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1652,6 +1652,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, pstate.continuescan = true; /* default assumption */ pstate.prechecked = false; pstate.firstmatch = false; + pstate.skipskip = false; pstate.rechecks = 0; pstate.targetdistance = 0; @@ -1744,6 +1745,23 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, /* Deliberately don't unset scanBehind flag just yet */ } + + /* + * Skip maintenance of skip arrays (if any) during primitive index + * scans that read leaf pages after the first + */ + if (so->skipScan && !firstPage) + { + IndexTuple itup; + + iid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, iid); + if (_bt_checkkeys_skipskip(scan, &pstate, itup, + RelationGetDescr(rel))) + { + pstate.skipskip = true; + } + } } /* load items[] in ascending order */ @@ -1847,6 +1865,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, truncatt = BTreeTupleGetNAtts(itup, rel); pstate.prechecked = false; /* precheck didn't cover HIKEY */ + if (pstate.skipskip) + { + /* + * reset array keys for finaltup call, since skipskip + * optimization prevented ordinary array maintenance + */ + Assert(so->skipScan); + _bt_start_array_keys(scan, dir); + pstate.skipskip = false; + } _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } @@ -1866,6 +1894,23 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, ItemId iid = PageGetItemId(page, minoff); pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + + /* + * Skip maintenance of skip arrays (if any) during primitive index + * scans that read leaf pages after the first + */ + if (so->skipScan && !firstPage) + { + IndexTuple itup; + + iid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, iid); + if (_bt_checkkeys_skipskip(scan, &pstate, itup, + RelationGetDescr(rel))) + { + pstate.skipskip = true; + } + } } /* load items[] in descending order */ @@ -1907,6 +1952,16 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, Assert(!BTreeTupleIsPivot(itup)); pstate.offnum = offnum; + if (offnum == minoff && pstate.skipskip) + { + /* + * reset array keys for finaltup call, since skipskip + * optimization prevented ordinary array maintenance + */ + Assert(so->skipScan); + _bt_start_array_keys(scan, dir); + pstate.skipskip = false; + } passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, itup, indnatts); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index ea8b34049..9d4f31dd0 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -151,11 +151,12 @@ static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); static void _bt_mark_scankey_required(ScanKey skey); static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, + bool advancenonrequired, bool skipskip, + bool prechecked, bool firstmatch, bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool *continuescan); + ScanDirection dir, bool skipskip, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, @@ -3104,14 +3105,14 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) * postcondition's <= operator with a >=. In other words, just swap the * precondition with the postcondition.) * - * We also deal with "advancing" non-required arrays here. Callers whose - * sktrig scan key is non-required specify sktrig_required=false. These calls - * are the only exception to the general rule about always advancing the - * required array keys (the scan may not even have a required array). These - * callers should just pass a NULL pstate (since there is never any question - * of stopping the scan). No call to _bt_tuple_before_array_skeys is required - * ahead of these calls (it's already clear that any required scan keys must - * be satisfied by caller's tuple). + * We also deal with "advancing" non-required arrays here (or arrays that are + * temporarily being treated as non-required due to the application of the + * "skipskip" optimization). Callers whose sktrig scan key is non-required + * specify sktrig_required=false. These calls are the only exception to the + * general rule about always advancing the required array keys (the scan may + * not even have a required array). These callers should just pass a NULL + * pstate (since there is never any question of stopping the scan). No call + * to _bt_tuple_before_array_skeys is required ahead of these calls. * * Note that we deal with non-array required equality strategy scan keys as * degenerate single element arrays here. Obviously, they can never really @@ -3168,8 +3169,8 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ pstate->firstmatch = false; - /* Shouldn't have to invalidate 'prechecked', though */ - Assert(!pstate->prechecked); + /* Shouldn't have to invalidate 'prechecked' or 'skipskip', though */ + Assert(!pstate->prechecked && !pstate->skipskip); /* * Once we return we'll have a new set of required array keys, so @@ -3221,8 +3222,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) { - Assert(sktrig_required); - required = true; if (cur->sk_attno > tupnatts) @@ -3369,7 +3368,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } else { - Assert(sktrig_required && required); + Assert(required); /* * This is a required non-array equality strategy scan key, which @@ -3411,7 +3410,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * be eliminated by _bt_preprocess_keys. It won't matter if some of * our "true" array scan keys (or even all of them) are non-required. */ - if (required && + if (sktrig_required && required && ((ScanDirectionIsForward(dir) && result > 0) || (ScanDirectionIsBackward(dir) && result < 0))) beyond_end_advance = true; @@ -3426,7 +3425,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, * array scan keys are considered interesting.) */ all_satisfied = false; - if (required) + if (sktrig_required && required) all_required_satisfied = false; else { @@ -3539,7 +3538,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, /* Recheck _bt_check_compare on behalf of caller */ if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - false, false, false, + false, !sktrig_required, false, false, &continuescan, &nsktrig) && !so->scanBehind) { @@ -4909,7 +4908,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - arrayKeys, pstate->prechecked, pstate->firstmatch, + arrayKeys, pstate->skipskip, + pstate->prechecked, pstate->firstmatch, &pstate->continuescan, &ikey); #ifdef USE_ASSERT_CHECKING @@ -4921,7 +4921,8 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, * Assert that the scan isn't in danger of becoming confused. */ Assert(!so->scanBehind && !so->oppositeDirCheck); - Assert(!pstate->prechecked && !pstate->firstmatch); + Assert(!pstate->skipskip && !pstate->prechecked && + !pstate->firstmatch); Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, false, 0, NULL)); } @@ -4935,7 +4936,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, * get the same answer without those optimizations */ Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, - false, false, false, + false, pstate->skipskip, false, false, &dcontinuescan, &dikey)); Assert(pstate->continuescan == dcontinuescan); } @@ -4958,6 +4959,7 @@ _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, * It's also possible that the scan is still _before_ the _start_ of * tuples matching the current set of array keys. Check for that first. */ + Assert(!pstate->skipskip); if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, ikey, NULL)) { @@ -5066,7 +5068,7 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, Assert(so->numArrayKeys); _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc, - false, false, false, &continuescan, &ikey); + false, false, false, false, &continuescan, &ikey); if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber) return false; @@ -5105,17 +5107,24 @@ _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, * * Though we advance non-required array keys on our own, that shouldn't have * any lasting consequences for the scan. By definition, non-required arrays - * have no fixed relationship with the scan's progress. (There are delicate - * considerations for non-required arrays when the arrays need to be advanced - * following our setting continuescan to false, but that doesn't concern us.) + * have no fixed relationship with the scan's progress. (skipskip=true makes + * us treat required scan keys as non-required, though. That's safe because + * _bt_readpage will account for our failure to fully maintain the scan's + * arrays later on, right before its finaltup call to _bt_checkkeys.) * * Pass advancenonrequired=false to avoid all array related side effects. * This allows _bt_advance_array_keys caller to avoid infinite recursion. + * + * Pass skipskip=true to instruct us to skip all of the scan's skip arrays. + * This provides the scan with a way of keeping the cost of maintaining its + * skip arrays under control, given skip arrays on high cardinality columns + * (i.e. given a skip array on a column which isn't a good fit for skip scan). */ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - bool advancenonrequired, bool prechecked, bool firstmatch, + bool advancenonrequired, bool skipskip, + bool prechecked, bool firstmatch, bool *continuescan, int *ikey) { BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -5132,10 +5141,18 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, /* * Check if the key is required in the current scan direction, in the - * opposite scan direction _only_, or in neither direction + * opposite scan direction _only_, or in neither direction (except + * when reading a page that's now using the "skipskip" optimization) */ - if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) + if (skipskip) + { + Assert(!prechecked); + + if (key->sk_flags & SK_BT_SKIP) + continue; + } + else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || + ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) requiredSameDir = true; else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) @@ -5193,7 +5210,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, if (key->sk_flags & SK_ROW_HEADER) { if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - continuescan)) + skipskip, continuescan)) continue; return false; } @@ -5248,7 +5265,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * (_bt_advance_array_keys also relies on this behavior during * forward scans.) */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if ((requiredSameDir || requiredOppositeDirOnly) && ScanDirectionIsBackward(dir)) *continuescan = false; } @@ -5266,7 +5283,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, * (_bt_advance_array_keys also relies on this behavior during * backward scans.) */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if ((requiredSameDir || requiredOppositeDirOnly) && ScanDirectionIsForward(dir)) *continuescan = false; } @@ -5335,7 +5352,8 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, bool *continuescan) + TupleDesc tupdesc, ScanDirection dir, bool skipskip, + bool *continuescan) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); int32 cmpresult = 0; @@ -5375,7 +5393,11 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { - if (subkey->sk_flags & SK_BT_NULLS_FIRST) + if (skipskip) + { + /* treat scan key as non-required during skipskip calls */ + } + else if (subkey->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have @@ -5428,8 +5450,12 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, */ if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument)) subkey--; - if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) + if (skipskip) + { + /* treat scan key as non-required during skipskip calls */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) *continuescan = false; else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) @@ -5481,7 +5507,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, break; } - if (!result) + if (!result && !skipskip) { /* * Tuple fails this qual. If it's a required qual for the current @@ -5525,6 +5551,8 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, OffsetNumber aheadoffnum; IndexTuple ahead; + Assert(!pstate->skipskip); + /* Avoid looking ahead when comparing the page high key */ if (pstate->offnum < pstate->minoff) return; @@ -5585,6 +5613,99 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, } } +/* + * Can _bt_checkkeys/_bt_check_compare apply the 'skipskip' optimization? + * + * Return value indicates if the optimization is safe for the tuples on the + * page after caller's tuple, but before its page's finaltup. + */ +bool +_bt_checkkeys_skipskip(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, TupleDesc tupdesc) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = so->currPos.dir; + IndexTuple finaltup = pstate->finaltup; + int arrayidx = 0, + nfinaltupatts = 0; + bool rangearrayseen = false; + + Assert(!BTreeTupleIsPivot(tuple)); + Assert(tuple != finaltup); + + if (finaltup) + nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel); + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array = NULL; + Datum tupdatum; + bool tupnull; + int32 result; + + /* + * Only need to check range skip arrays within this loop. + * + * A SAOP array can always be treated as a non-required array within + * _bt_check_compare. A skip array without a lower or upper bound is + * always safe to skip within _bt_check_compare, since it is satisfied + * by every possible value. + */ + if (cur->sk_strategy != BTEqualStrategyNumber) + continue; + if (!(cur->sk_flags & SK_SEARCHARRAY)) + continue; + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == ikey); + if (array->num_elems != -1 || array->null_elem) + continue; + + /* + * Found a range skip array to test. + * + * Scans with more than one range skip array are not eligible to use + * the optimization. Note that we support the skipskip optimization + * for a qual like "WHERE a BETWEEN 1 AND 10 AND b BETWEEN 1 AND 3", + * since there the qual actually requires only a single skip array. + * However, if such a qual ended with "... AND C > 42", then it will + * prevent use of the skipskip optimization. + */ + if (rangearrayseen) + return false; + + /* + * Don't attempt the optimization when we have a skip array and are + * reading the rightmost leaf page (or the leftmost leaf page, when + * scanning backwards) + */ + if (!finaltup) + return false; + rangearrayseen = true; + + /* test the tuple that just advanced arrays within our caller */ + Assert(cur->sk_flags & SK_BT_SKIP); + Assert(cur->sk_flags & SK_BT_REQFWD); + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], false, dir, + tupdatum, tupnull, array, cur, &result); + if (result != 0) + return false; + + /* test the page's finaltup iff relevant attribute isn't truncated */ + if (cur->sk_attno > nfinaltupatts) + continue; + + tupdatum = index_getattr(finaltup, cur->sk_attno, tupdesc, &tupnull); + _bt_binsrch_skiparray_skey(&so->orderProcs[ikey], false, dir, + tupdatum, tupnull, array, cur, &result); + if (result != 0) + return false; + } + + return true; +} + /* * _bt_killitems - set LP_DEAD state for items an indexscan caller has * told us were killed -- 2.45.2