v5-0001-Return-TIDs-in-desc-order-during-backwards-scans.patch
application/x-patch
Filename: v5-0001-Return-TIDs-in-desc-order-during-backwards-scans.patch
Type: application/x-patch
Part: 0
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v5-0001
Subject: Return TIDs in desc order during backwards scans.
| File | + | − |
|---|---|---|
| src/backend/access/nbtree/nbtreadpage.c | 24 | 33 |
| src/backend/access/nbtree/nbtutils.c | 29 | 8 |
From 9149ce1e425f17bd965d361f85c6b98a54535088 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sun, 15 Jun 2025 18:06:57 -0400
Subject: [PATCH v5] Return TIDs in desc order during backwards scans.
Always return TIDs in descending order when returning a group of
duplicates to the scan whose TIDs come from an nbtree posting list
tuples during nbtree backwards scans. This makes backwards scans tend
to require fewer buffer hits, since the scan is less likely to
repeatedly pin and unpin the same heap page/buffer (we'll get exactly as
many buffer hits as we get with a similar forwards scan case).
Commit 0d861bbb, which added nbtree deduplication, originally did things
this way to avoid interfering with _bt_killitems's previous approach to
setting LP_DEAD bits on posting list tuples. _bt_killitems made a soft
assumption that it can iterate through posting lists in TID order,
finding corresponding killItems[]/so->currPos.items[] entries in that
same order (even when scanning backwards). This worked out because of
the prior _bt_readpage backwards scan behavior. If we just change the
backwards scan posting list logic in _bt_readpage, and don't alter
_bt_killitems itself, we'll break its soft assumption.
Avoid that problem by sorting the so->killedItems[] array at the start
of _bt_killitems. That way the order that dead items are saved in from
btgettuple can't matter; so->killedItems[] will always be in the same
order as so->currPos.items[] in the end. Since so->currPos.items[] is
now always in leaf page order, regardless of the scan direction used
within _bt_readpage, and since so->killedItems[] is always in that same
order, the _bt_killitems loop can continue to make a uniform assumption
about everything being in page order.
Also deduplicate the so->killedItems[] array after it is sorted. That
way there's no risk of the _bt_killitems loop becoming confused by
duplicate dead items/TID. This was possible in cases that involved a
scrollable cursor that encountered the same dead TID more than once
(within the same leaf page/so->currPos context). This doesn't matter
very much in practice, but it seems best to be as consistent as possible
about LP_DEAD marking items within _bt_killitems.
Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Mircea Cadariu <cadariu.mircea@gmail.com>
Reviewed-By: Victor Yegorov <vyegorov@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wz=Wut2pKvbW-u3hJ_LXwsYeiXHiW8oN1GfbKPavcGo8Ow@mail.gmail.com
---
src/backend/access/nbtree/nbtreadpage.c | 57 +++++++++++--------------
src/backend/access/nbtree/nbtutils.c | 37 ++++++++++++----
2 files changed, 53 insertions(+), 41 deletions(-)
diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c
index ac67b6f7a..b3b8b5534 100644
--- a/src/backend/access/nbtree/nbtreadpage.c
+++ b/src/backend/access/nbtree/nbtreadpage.c
@@ -163,19 +163,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
Assert(BTScanPosIsPinned(so->currPos));
Assert(!so->needPrimScan);
- if (scan->parallel_scan)
- {
- /* allow next/prev page to be read by other worker without delay */
- if (ScanDirectionIsForward(dir))
- _bt_parallel_release(scan, so->currPos.nextPage,
- so->currPos.currPage);
- else
- _bt_parallel_release(scan, so->currPos.prevPage,
- so->currPos.currPage);
- }
-
- PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
-
/* initialize local variables */
indnatts = IndexRelationGetNumberOfAttributes(rel);
arrayKeys = so->numArrayKeys != 0;
@@ -197,6 +184,19 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
pstate.targetdistance = 0;
pstate.nskipadvances = 0;
+ if (scan->parallel_scan)
+ {
+ /* allow next/prev page to be read by other worker without delay */
+ if (ScanDirectionIsForward(dir))
+ _bt_parallel_release(scan, so->currPos.nextPage,
+ so->currPos.currPage);
+ else
+ _bt_parallel_release(scan, so->currPos.prevPage,
+ so->currPos.currPage);
+ }
+
+ PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
+
if (ScanDirectionIsForward(dir))
{
/* SK_SEARCHARRAY forward scans must provide high key up front */
@@ -208,7 +208,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
- if (so->scanBehind &&
+ if (unlikely(so->scanBehind) &&
!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
{
/* Schedule another primitive index scan after all */
@@ -287,16 +287,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
{
int tupleOffset;
- /*
- * Set up state to return posting list, and remember first
- * TID
- */
+ /* Set up posting list state (and remember first TID) */
tupleOffset =
_bt_setuppostingitems(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, 0),
itup);
itemIndex++;
- /* Remember additional TIDs */
+
+ /* Remember all later TIDs (must be at least one) */
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
_bt_savepostingitem(so, itemIndex, offnum,
@@ -359,7 +357,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
- if (so->scanBehind &&
+ if (unlikely(so->scanBehind) &&
!_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
{
/* Schedule another primitive index scan after all */
@@ -472,25 +470,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
}
else
{
+ uint16 nitems = BTreeTupleGetNPosting(itup);
int tupleOffset;
- /*
- * Set up state to return posting list, and remember first
- * TID.
- *
- * Note that we deliberately save/return items from
- * posting lists in ascending heap TID order for backwards
- * scans. This allows _bt_killitems() to make a
- * consistent assumption about the order of items
- * associated with the same posting list tuple.
- */
+ /* Set up posting list state (and remember last TID) */
itemIndex--;
tupleOffset =
_bt_setuppostingitems(so, itemIndex, offnum,
- BTreeTupleGetPostingN(itup, 0),
+ BTreeTupleGetPostingN(itup, nitems - 1),
itup);
- /* Remember additional TIDs */
- for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+
+ /* Remember all prior TIDs (must be at least one) */
+ for (int i = nitems - 2; i >= 0; i--)
{
itemIndex--;
_bt_savepostingitem(so, itemIndex, offnum,
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 33b0e4055..837c30690 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -21,12 +21,15 @@
#include "access/reloptions.h"
#include "access/relscan.h"
#include "commands/progress.h"
+#include "common/int.h"
+#include "lib/qunique.h"
#include "miscadmin.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+static int _bt_compare_int(const void *va, const void *vb);
static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
IndexTuple firstright, BTScanInsert itup_key);
@@ -157,6 +160,18 @@ _bt_freestack(BTStack stack)
}
}
+/*
+ * qsort comparison function for int arrays
+ */
+static int
+_bt_compare_int(const void *va, const void *vb)
+{
+ int a = *((const int *) va);
+ int b = *((const int *) vb);
+
+ return pg_cmp_s32(a, b);
+}
+
/*
* _bt_killitems - set LP_DEAD state for items an indexscan caller has
* told us were killed
@@ -206,6 +221,20 @@ _bt_killitems(IndexScanDesc scan)
/* Always invalidate so->killedItems[] before leaving so->currPos */
so->numKilled = 0;
+ /*
+ * so->killedItems[] is in whatever order the scan returned items in.
+ * Items will appear in descending order during backwards scans. And
+ * scrollable cursor scans might have duplicate items.
+ *
+ * Sort and uniqueify so->killedItems[] to deal with all this.
+ */
+ if (numKilled > 1)
+ {
+ qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
+ numKilled = qunique(so->killedItems, numKilled, sizeof(int),
+ _bt_compare_int);
+ }
+
if (!so->dropPin)
{
/*
@@ -265,14 +294,6 @@ _bt_killitems(IndexScanDesc scan)
int j;
/*
- * We rely on the convention that heap TIDs in the scanpos
- * items array are stored in ascending heap TID order for a
- * group of TIDs that originally came from a posting list
- * tuple. This convention even applies during backwards
- * scans, where returning the TIDs in descending order might
- * seem more natural. This is about effectiveness, not
- * correctness.
- *
* Note that the page may have been modified in almost any way
* since we first read it (in the !so->dropPin case), so it's
* possible that this posting list tuple wasn't a posting list
--
2.51.0