v1-0003-Provide-support-for-global-Index-Scan-Path.patch
application/octet-stream
Filename: v1-0003-Provide-support-for-global-Index-Scan-Path.patch
Type: application/octet-stream
Part: 3
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v1-0003
Subject: Provide support for global Index Scan Path
| File | + | − |
|---|---|---|
| src/backend/access/index/genam.c | 19 | 0 |
| src/backend/access/index/indexam.c | 238 | 7 |
| src/backend/access/nbtree/nbtree.c | 9 | 1 |
| src/backend/access/nbtree/nbtsearch.c | 59 | 12 |
| src/backend/catalog/partition.c | 0 | 4 |
| src/backend/commands/explain.c | 10 | 2 |
| src/backend/executor/nodeIndexonlyscan.c | 23 | 2 |
| src/backend/executor/nodeIndexscan.c | 14 | 2 |
| src/backend/optimizer/path/allpaths.c | 12 | 0 |
| src/backend/optimizer/path/indxpath.c | 38 | 2 |
| src/backend/optimizer/plan/planmain.c | 3 | 1 |
| src/backend/optimizer/plan/planner.c | 137 | 2 |
| src/backend/optimizer/util/appendinfo.c | 59 | 1 |
| src/backend/optimizer/util/plancat.c | 30 | 13 |
| src/backend/optimizer/util/var.c | 1 | 0 |
| src/backend/parser/parse_utilcmd.c | 1 | 0 |
| src/backend/utils/adt/selfuncs.c | 4 | 0 |
| src/backend/utils/cache/plancache.c | 15 | 0 |
| src/bin/psql/describe.c | 11 | 4 |
| src/include/access/genam.h | 6 | 0 |
| src/include/access/nbtree.h | 3 | 0 |
| src/include/access/relscan.h | 7 | 1 |
| src/include/nodes/pathnodes.h | 21 | 0 |
| src/include/nodes/plannodes.h | 3 | 0 |
| src/include/optimizer/appendinfo.h | 2 | 0 |
From 1f962d31d9f71fafe729b5a25396cdce112b7646 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilipkumar@Dilip.local>
Date: Thu, 15 May 2025 17:39:58 +0530
Subject: [PATCH v1 3/4] Provide support for global Index Scan Path
In previous patches we have added support for creating the global index. Now
in this patch we provide support in the planner to choose global index scan
and index-only scan paths for the append rel.
Currently we do not have support for selecting a bitmap scan using the global
index. We may do that in the future; to do so we need to change the
executor such that we can build a separate tidmap for each leaf relation while
scanning the global index and then do the bitmap heap scan one partition at a
time based on the bitmap.
We also do not support the parallel index scan using the global index. There
is nothing blocking as such but this is still a TODO.
Open Items
- In table_slot_callbacks(), a partitioned table can now generate tuples via a
global index scan, so we need a proper slot instead of just assigning a
virtual slot. This handling should maybe be done through an AM callback?
---
src/backend/access/index/genam.c | 19 ++
src/backend/access/index/indexam.c | 245 ++++++++++++++++++++++-
src/backend/access/nbtree/nbtree.c | 10 +-
src/backend/access/nbtree/nbtsearch.c | 71 +++++--
src/backend/catalog/partition.c | 4 -
src/backend/commands/explain.c | 12 +-
src/backend/executor/nodeIndexonlyscan.c | 25 ++-
src/backend/executor/nodeIndexscan.c | 16 +-
src/backend/optimizer/path/allpaths.c | 12 ++
src/backend/optimizer/path/indxpath.c | 40 +++-
src/backend/optimizer/plan/planmain.c | 4 +-
src/backend/optimizer/plan/planner.c | 139 ++++++++++++-
src/backend/optimizer/util/appendinfo.c | 60 +++++-
src/backend/optimizer/util/plancat.c | 43 ++--
src/backend/optimizer/util/var.c | 1 +
src/backend/parser/parse_utilcmd.c | 1 +
src/backend/utils/adt/selfuncs.c | 4 +
src/backend/utils/cache/plancache.c | 15 ++
src/bin/psql/describe.c | 15 +-
src/include/access/genam.h | 6 +
src/include/access/nbtree.h | 3 +
src/include/access/relscan.h | 8 +-
src/include/nodes/pathnodes.h | 21 ++
src/include/nodes/plannodes.h | 3 +
src/include/optimizer/appendinfo.h | 2 +
25 files changed, 725 insertions(+), 54 deletions(-)
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index c2b80669aa..13bd1e90b7 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -126,6 +126,25 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
scan->xs_hitup = NULL;
scan->xs_hitupdesc = NULL;
+ /*
+ * Set a flag to indicate a global index scan and create a cache for
+ * partition ID to relation OID lookup. This is necessary because a global
+ * index stores the partition ID along with each tuple, and when fetching a
+ * tuple, we need to convert that partition ID into a relation OID. For
+ * more details, refer to the comments above the PartitionId typedef.
+ */
+ if (RelationIsGlobalIndex(indexRelation))
+ {
+ scan->xs_global_index = true;
+ scan->xs_global_index_cache =
+ create_globalindex_partition_cache(CurrentMemoryContext);
+ }
+ else
+ {
+ scan->xs_global_index = false;
+ scan->xs_global_index_cache = NULL;
+ }
+
return scan;
}
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 3aa1fc92df..4e18d8150d 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -104,11 +104,35 @@ do { \
CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \
} while(0)
+/*
+ * Lookup table from relation OID to the relation descriptor and its
+ * IndexFetchTableData structure. table_index_fetch_begin() should be called
+ * only once for each partition, but scan->xs_heapfetch is overwritten with the
+ * fetch handle of whichever partition is currently being read. So when the
+ * scan comes back to a partition it has already visited, it must reuse that
+ * partition's existing xs_heapfetch, which it obtains from this cache.
+ */
+typedef struct GlobalIndexPartitionCacheData
+{
+ MemoryContext pdir_mcxt;
+ HTAB *pdir_hash;
+} GlobalIndexPartitionCacheData;
+
+typedef struct GlobalIndexPartitionCacheEntry
+{
+ Oid reloid;
+ Relation relation;
+ IndexFetchTableData *heapfetch;
+} GlobalIndexPartitionCacheEntry;
+
static IndexScanDesc index_beginscan_internal(Relation indexRelation,
int nkeys, int norderbys, Snapshot snapshot,
ParallelIndexScanDesc pscan, bool temp_snap);
static inline void validate_relation_kind(Relation r);
-
+static GlobalIndexPartitionCacheEntry *globalindex_partition_entry_lookup(
+ GlobalIndexPartitionCache pdir,
+ Oid relid);
+static void globalindex_partition_cache_reset(GlobalIndexPartitionCache pdir);
/* ----------------------------------------------------------------
* index_ interface functions
@@ -270,12 +294,29 @@ index_beginscan(Relation heapRelation,
* Save additional parameters into the scandesc. Everything else was set
* up by RelationGetIndexScan.
*/
- scan->heapRelation = heapRelation;
scan->xs_snapshot = snapshot;
scan->instrument = instrument;
- /* prepare to fetch index matches from table */
- scan->xs_heapfetch = table_index_fetch_begin(heapRelation);
+ /*
+ * For global index do not set the heapRelation and xs_heapfetch because
+ * while scanning the index we might get tids belongs to different
+ * partitions so we will initialize these fields when we actually fetch the
+ * tid from the index as that time we will know the relation oid from where
+ * we need to fetch the tid.
+ */
+ if (scan->xs_global_index)
+ {
+ scan->heapRelation = NULL;
+ scan->xs_heapfetch = NULL;
+ }
+ else
+ {
+ scan->heapRelation = heapRelation;
+
+ /* prepare to fetch index matches from table */
+ scan->xs_heapfetch = table_index_fetch_begin(heapRelation);
+ }
+
return scan;
}
@@ -365,7 +406,23 @@ index_rescan(IndexScanDesc scan,
Assert(norderbys == scan->numberOfOrderBys);
/* Release resources (like buffer pins) from table accesses */
- if (scan->xs_heapfetch)
+ if (scan->xs_global_index)
+ {
+ /*
+ * For the global index, also reset the xs_global_index_cache.
+ * Essentially, the global index will have multiple entries of
+ * xs_heapfetch corresponding to each partition. These entries will be
+ * reset inside globalindex_partition_cache_reset(). Here, we can
+ * simply set xs_heapfetch and heapRelation to NULL in the scan
+ * descriptor. For more details, refer to the comments inside
+ * index_beginscan().
+ */
+ scan->heapRelation = NULL;
+ scan->xs_heapfetch = NULL;
+ if (scan->xs_global_index_cache)
+ globalindex_partition_cache_reset(scan->xs_global_index_cache);
+ }
+ else if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
scan->kill_prior_tuple = false; /* for safety */
@@ -386,7 +443,18 @@ index_endscan(IndexScanDesc scan)
CHECK_SCAN_PROCEDURE(amendscan);
/* Release resources (like buffer pins) from table accesses */
- if (scan->xs_heapfetch)
+ if (scan->xs_global_index)
+ {
+ /*
+ * For a global index, destroy the cache as well; internally this will
+ * deallocate the index fetch handle for each partition.
+ */
+ if (scan->xs_global_index_cache)
+ globalindex_partition_cache_destroy(scan->xs_global_index_cache);
+ scan->heapRelation = NULL;
+ scan->xs_heapfetch = NULL;
+ }
+ else if (scan->xs_heapfetch)
{
table_index_fetch_end(scan->xs_heapfetch);
scan->xs_heapfetch = NULL;
@@ -442,7 +510,18 @@ index_restrpos(IndexScanDesc scan)
CHECK_SCAN_PROCEDURE(amrestrpos);
/* release resources (like buffer pins) from table accesses */
- if (scan->xs_heapfetch)
+ if (scan->xs_global_index)
+ {
+ /*
+ * For a global index, reset the cache as well; internally this will
+ * reset the index fetch handle for each partition.
+ */
+ if (scan->xs_global_index_cache)
+ globalindex_partition_cache_reset(scan->xs_global_index_cache);
+ scan->heapRelation = NULL;
+ scan->xs_heapfetch = NULL;
+ }
+ else if (scan->xs_heapfetch)
table_index_fetch_reset(scan->xs_heapfetch);
scan->kill_prior_tuple = false; /* for safety */
@@ -742,6 +821,15 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *
* the index.
*/
Assert(ItemPointerIsValid(&scan->xs_heaptid));
+
+ /*
+ * For a global index we need to get the heap OID of the partition
+ * relation from the scan descriptor, as stored by the index scan, and
+ * fetch the tuple from that relation.
+ */
+ if (scan->xs_global_index)
+ global_indexscan_setup_partrel(scan);
+
if (index_fetch_heap(scan, slot))
return true;
}
@@ -1085,3 +1173,146 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
return build_local_reloptions(&relopts, attoptions, validate);
}
+
+/*
+ * global_indexscan_setup_partrel
+ *
+ * Point scan->heapRelation and scan->xs_heapfetch at the partition that owns
+ * the index entry most recently returned by a global index scan.  Used by
+ * index_getnext_slot() and IndexOnlyNext(), because a global index can return
+ * TIDs that belong to different partitions.
+ *
+ * The partition is identified by scan->xs_heapoid; the relation descriptor
+ * and fetch handle are taken from scan->xs_global_index_cache.
+ */
+void
+global_indexscan_setup_partrel(IndexScanDesc scan)
+{
+	Oid			target_relid = scan->xs_heapoid;
+	GlobalIndexPartitionCacheEntry *cached;
+
+	if (scan->heapRelation != NULL)
+	{
+		/* Same partition as the one already installed: nothing to do. */
+		if (RelationGetRelid(scan->heapRelation) == target_relid)
+			return;
+
+		/*
+		 * Index entries of different partitions can be interleaved, so we are
+		 * moving to another partition; reset the fetch handle we are leaving.
+		 */
+		table_index_fetch_reset(scan->xs_heapfetch);
+	}
+
+	/* No partition installed yet, or a partition switch: install the entry. */
+	cached = globalindex_partition_entry_lookup(scan->xs_global_index_cache,
+												target_relid);
+	scan->heapRelation = cached->relation;
+	scan->xs_heapfetch = cached->heapfetch;
+}
+
+/*
+ * create_globalindex_partition_cache - create the index scan partition cache
+ *
+ * Builds an empty cache, keyed by relation OID, whose header and hash table
+ * are both allocated in mcxt.  See the comments above the
+ * GlobalIndexPartitionCacheData structure for what the cache is used for.
+ */
+GlobalIndexPartitionCache
+create_globalindex_partition_cache(MemoryContext mcxt)
+{
+	GlobalIndexPartitionCache cache;
+	HASHCTL		hashctl;
+	MemoryContext saved_cxt;
+
+	/* Describe the table: keyed by relation OID, placed in mcxt. */
+	MemSet(&hashctl, 0, sizeof(HASHCTL));
+	hashctl.keysize = sizeof(Oid);
+	hashctl.entrysize = sizeof(GlobalIndexPartitionCacheEntry);
+	hashctl.hcxt = mcxt;
+
+	/* Allocate the cache header in the caller-supplied context. */
+	saved_cxt = MemoryContextSwitchTo(mcxt);
+
+	cache = palloc(sizeof(GlobalIndexPartitionCacheData));
+	cache->pdir_mcxt = mcxt;
+	cache->pdir_hash = hash_create("globalIndex partitionId cache", 256,
+								   &hashctl,
+								   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+	MemoryContextSwitchTo(saved_cxt);
+
+	return cache;
+}
+
+/*
+ * globalindex_partition_entry_lookup
+ *
+ * Return the cache entry (relation descriptor plus index fetch handle) for
+ * the partition identified by relid.  On a miss, the relation is opened with
+ * AccessShareLock, an index fetch is started on it, and the result is stored
+ * in the cache for subsequent lookups.
+ *
+ * The relation is opened and the fetch handle created *before* the hash
+ * entry is inserted, so an error raised by either step cannot leave a
+ * half-initialized entry in the table for the reset/destroy routines to
+ * walk over.
+ */
+static GlobalIndexPartitionCacheEntry *
+globalindex_partition_entry_lookup(GlobalIndexPartitionCache pdir, Oid relid)
+{
+	GlobalIndexPartitionCacheEntry *pde;
+	bool		found;
+	Relation	part_rel;
+	IndexFetchTableData *heapfetch;
+
+	Assert(OidIsValid(relid));
+	Assert(pdir);
+
+	/* Fast path: partition already seen during this scan. */
+	pde = hash_search(pdir->pdir_hash, &relid, HASH_FIND, &found);
+	if (found)
+		return pde;
+
+	/* Acquire everything that can fail before publishing the entry. */
+	part_rel = relation_open(relid, AccessShareLock);
+	heapfetch = table_index_fetch_begin(part_rel);
+
+	pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found);
+	Assert(!found);
+	pde->relation = part_rel;
+	pde->heapfetch = heapfetch;
+
+	return pde;
+}
+
+/*
+ * globalindex_partition_cache_destroy - destroy the cache
+ *
+ * End the index fetch and close the relation of every cached partition, then
+ * release the hash table and the cache header themselves.  Relations are
+ * closed with NoLock, so no lock is released here.  The cache must not be
+ * used after this call.  A NULL cache is accepted and ignored.
+ */
+void
+globalindex_partition_cache_destroy(GlobalIndexPartitionCache pdir)
+{
+	HASH_SEQ_STATUS status;
+	GlobalIndexPartitionCacheEntry *pde;
+
+	if (pdir == NULL)
+		return;
+
+	hash_seq_init(&status, pdir->pdir_hash);
+	while ((pde = hash_seq_search(&status)) != NULL)
+	{
+		if (pde->heapfetch)
+		{
+			table_index_fetch_end(pde->heapfetch);
+			pde->heapfetch = NULL;
+		}
+
+		relation_close(pde->relation, NoLock);
+		pde->relation = NULL;
+	}
+
+	/* Free the table and header, not just the resources hanging off them. */
+	hash_destroy(pdir->pdir_hash);
+	pfree(pdir);
+}
+
+/*
+ * globalindex_partition_cache_reset - reset the cache
+ *
+ * Reset the index fetch handle of every cached partition that has one.  The
+ * entries themselves stay in the cache.
+ */
+static void
+globalindex_partition_cache_reset(GlobalIndexPartitionCache pdir)
+{
+	GlobalIndexPartitionCacheEntry *pde;
+	HASH_SEQ_STATUS seq;
+
+	hash_seq_init(&seq, pdir->pdir_hash);
+	for (;;)
+	{
+		pde = hash_seq_search(&seq);
+		if (pde == NULL)
+			break;
+
+		if (pde->heapfetch != NULL)
+			table_index_fetch_reset(pde->heapfetch);
+	}
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index c3960784eb..e310ddcea6 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,7 +228,15 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
BTScanOpaque so = (BTScanOpaque) scan->opaque;
bool res;
- Assert(scan->heapRelation != NULL);
+ /*
+ * When working with global indexes, the scan's heap relation
+ * (scan->heapRelation) is not set beforehand. Instead, it's populated by
+ * the index scan interfaces, dynamically determined based on the TID being
+ * processed. This is because global index tuples explicitly carry the heap
+ * OID (along with the TID) to identify the originating heap relation.
+ */
+ Assert(RelationIsGlobalIndex(scan->indexRelation) ||
+ scan->heapRelation != NULL);
/* btree indexes are never lossy */
scan->xs_recheck = false;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 36544ecfd5..44841394df 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -35,13 +35,14 @@ static int _bt_binsrch_posting(BTScanInsert key, Page page,
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum, bool firstpage);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
- OffsetNumber offnum, IndexTuple itup);
+ OffsetNumber offnum, IndexTuple itup, Oid heapOid);
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, ItemPointer heapTid,
- IndexTuple itup);
+ IndexTuple itup, Oid heapOid);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum,
- ItemPointer heapTid, int tupleOffset);
+ ItemPointer heapTid, int tupleOffset,
+ Oid heapOid);
static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum,
@@ -1608,6 +1609,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
bool arrayKeys;
int itemIndex,
indnatts;
+ Oid heapOid;
/* save the page/buffer block number, along with its sibling links */
page = BufferGetPage(so->currPos.buf);
@@ -1718,6 +1720,27 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
+ /*
+ * For global index we also need to fetch the relation oid in order
+ * to know from which relation we need to fetch tuple.
+ */
+ if (RelationIsGlobalIndex(scan->indexRelation))
+ {
+ heapOid = BTreeTupleGetPartitionRelid(scan->indexRelation, itup);
+
+ /*
+ * If the partition is already detached then we will get an
+ * InvalidOid so ignore such tuples.
+ */
+ if (!OidIsValid(heapOid))
+ {
+ offnum = OffsetNumberNext(offnum);
+ continue;
+ }
+ }
+ else
+ heapOid = InvalidOid;
+
pstate.offnum = offnum;
passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
itup, indnatts);
@@ -1743,7 +1766,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
if (!BTreeTupleIsPosting(itup))
{
/* Remember it */
- _bt_saveitem(so, itemIndex, offnum, itup);
+ _bt_saveitem(so, itemIndex, offnum, itup, heapOid);
itemIndex++;
}
else
@@ -1757,14 +1780,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
tupleOffset =
_bt_setuppostingitems(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, 0),
- itup);
+ itup, heapOid);
itemIndex++;
/* Remember additional TIDs */
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
_bt_savepostingitem(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, i),
- tupleOffset);
+ tupleOffset, heapOid);
itemIndex++;
}
}
@@ -1883,6 +1906,24 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
itup = (IndexTuple) PageGetItem(page, iid);
Assert(!BTreeTupleIsPivot(itup));
+ /*
+ * For a global index we also need to fetch the partition id in order
+ * to know from which relation we need to fetch the tuple. We might
+ * get an InvalidOid if the partition is already detached, so ignore
+ * such tuples.
+ */
+ if (RelationIsGlobalIndex(scan->indexRelation))
+ {
+ heapOid = BTreeTupleGetPartitionRelid(scan->indexRelation, itup);
+ if (!OidIsValid(heapOid))
+ {
+ offnum = OffsetNumberNext(offnum);
+ continue;
+ }
+ }
+ else
+ heapOid = InvalidOid;
+
pstate.offnum = offnum;
if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
{
@@ -1931,7 +1972,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
{
/* Remember it */
itemIndex--;
- _bt_saveitem(so, itemIndex, offnum, itup);
+ _bt_saveitem(so, itemIndex, offnum, itup, heapOid);
}
else
{
@@ -1951,14 +1992,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
tupleOffset =
_bt_setuppostingitems(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, 0),
- itup);
+ itup, heapOid);
/* Remember additional TIDs */
for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
{
itemIndex--;
_bt_savepostingitem(so, itemIndex, offnum,
BTreeTupleGetPostingN(itup, i),
- tupleOffset);
+ tupleOffset, heapOid);
}
}
}
@@ -2002,12 +2043,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
/* Save an index item into so->currPos.items[itemIndex] */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
- OffsetNumber offnum, IndexTuple itup)
+ OffsetNumber offnum, IndexTuple itup, Oid heapOid)
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
+ currItem->heapOid = heapOid;
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
if (so->currTuples)
@@ -2032,12 +2074,13 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
*/
static int
_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
- ItemPointer heapTid, IndexTuple itup)
+ ItemPointer heapTid, IndexTuple itup, Oid heapOid)
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
Assert(BTreeTupleIsPosting(itup));
+ currItem->heapOid = heapOid;
currItem->heapTid = *heapTid;
currItem->indexOffset = offnum;
if (so->currTuples)
@@ -2070,10 +2113,11 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
*/
static inline void
_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
- ItemPointer heapTid, int tupleOffset)
+ ItemPointer heapTid, int tupleOffset, Oid heapOid)
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+ currItem->heapOid = heapOid;
currItem->heapTid = *heapTid;
currItem->indexOffset = offnum;
@@ -2100,6 +2144,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Assert(so->currPos.itemIndex <= so->currPos.lastItem);
/* Return next item, per amgettuple contract */
+ /* For global index we must have a valid heap oid. */
+ Assert(!scan->xs_global_index || OidIsValid(currItem->heapOid));
+ scan->xs_heapoid = currItem->heapOid;
scan->xs_heaptid = currItem->heapTid;
if (so->currTuples)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c
index 472a096206..48bd2066a1 100644
--- a/src/backend/catalog/partition.c
+++ b/src/backend/catalog/partition.c
@@ -44,10 +44,6 @@ static void get_partition_ancestors_worker(Relation inhRel, Oid relid,
*
* If the partition is in the process of being detached, an error is thrown,
* unless even_if_detached is passed as true.
- *
- * Note: Because this function assumes that the relation whose OID is passed
- * as an argument will have precisely one parent, it should only be called
- * when it is known that the relation is a partition.
*/
Oid
get_partition_parent(Oid relid, bool even_if_detached)
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 7e2792ead7..0721135200 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1442,10 +1442,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
pname = sname = "Gather Merge";
break;
case T_IndexScan:
- pname = sname = "Index Scan";
+ if (get_rel_relkind(((IndexScan *) plan)->indexid) ==
+ RELKIND_GLOBAL_INDEX)
+ pname = sname = "Global Index Scan";
+ else
+ pname = sname = "Index Scan";
break;
case T_IndexOnlyScan:
- pname = sname = "Index Only Scan";
+ /* This arm handles IndexOnlyScan nodes, so cast to that type. */
+ if (get_rel_relkind(((IndexOnlyScan *) plan)->indexid) ==
+ RELKIND_GLOBAL_INDEX)
+ pname = sname = "Global Index Only Scan";
+ else
+ pname = sname = "Index Only Scan";
break;
case T_BitmapIndexScan:
pname = sname = "Bitmap Index Scan";
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index f464cca950..f85962b88a 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -43,6 +43,7 @@
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/builtins.h"
+#include "utils/lsyscache.h"
#include "utils/rel.h"
@@ -124,6 +125,14 @@ IndexOnlyNext(IndexOnlyScanState *node)
CHECK_FOR_INTERRUPTS();
+ /*
+ * For a global index we need to get the heap OID of the partition
+ * relation from the scan descriptor, as stored by the index scan, in
+ * order to check the visibility map of that relation.
+ */
+ if (scandesc->xs_global_index)
+ global_indexscan_setup_partrel(scandesc);
+
/*
* We can skip the heap fetch if the TID references a heap page on
* which all tuples are known visible to everybody. In any case,
@@ -534,6 +543,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
TupleDesc tupDesc;
int indnkeyatts;
int namecount;
+ const TupleTableSlotOps *tts_cb;
/*
* create state structure
@@ -569,14 +579,25 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc,
&TTSOpsVirtual);
+ /*
+ * FIXME: Global index scans on partitioned tables require
+ * TTSOpsBufferHeapTuple, but partitioned tables normally get TTSOpsVirtual
+ * (no TableAM). We currently hack this by assuming partitions with global
+ * indexes are Heap AM. Proper TableAM integration for partitioned tables
+ * is needed for slot allocation.
+ */
+ if (get_rel_relkind(node->indexid) == RELKIND_GLOBAL_INDEX)
+ tts_cb = &TTSOpsBufferHeapTuple;
+ else
+ tts_cb = table_slot_callbacks(currentRelation);
+
/*
* We need another slot, in a format that's suitable for the table AM, for
* when we need to fetch a tuple from the table for rechecking visibility.
*/
indexstate->ioss_TableSlot =
ExecAllocTableSlot(&estate->es_tupleTable,
- RelationGetDescr(currentRelation),
- table_slot_callbacks(currentRelation));
+ RelationGetDescr(currentRelation), tts_cb);
/*
* Initialize result type and projection info. The node's targetlist will
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 7fcaa37fe6..6cd041330d 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -911,6 +911,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
IndexScanState *indexstate;
Relation currentRelation;
LOCKMODE lockmode;
+ const TupleTableSlotOps *tts_cb;
/*
* create state structure
@@ -935,12 +936,23 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
indexstate->ss.ss_currentRelation = currentRelation;
indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */
+ /*
+ * FIXME: Global index scans on partitioned tables require
+ * TTSOpsBufferHeapTuple, but partitioned tables normally get TTSOpsVirtual
+ * (no TableAM). We currently hack this by assuming partitions with global
+ * indexes are Heap AM. Proper TableAM integration for partitioned tables
+ * is needed for slot allocation.
+ */
+ if (get_rel_relkind(node->indexid) == RELKIND_GLOBAL_INDEX)
+ tts_cb = &TTSOpsBufferHeapTuple;
+ else
+ tts_cb = table_slot_callbacks(currentRelation);
+
/*
* get the scan type from the relation descriptor.
*/
ExecInitScanTupleSlot(estate, &indexstate->ss,
- RelationGetDescr(currentRelation),
- table_slot_callbacks(currentRelation));
+ RelationGetDescr(currentRelation), tts_cb);
/*
* Initialize result type and projection.
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 6cc6966b06..230a98f221 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -1211,6 +1211,12 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel,
}
}
+ /*
+ * We need to check the index predicate for the parent relation, as the
+ * parent relation may have global index scan paths.
+ */
+ check_index_predicates(root, rel);
+
if (has_live_children)
{
/*
@@ -1303,6 +1309,12 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
/* Add paths to the append relation. */
add_paths_to_append_rel(root, rel, live_childrels);
+
+ /*
+ * A partitioned relation may have global indexes, so let's consider
+ * index scan paths.
+ */
+ create_index_paths(root, rel);
}
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 601354ea3e..8fef652d4a 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -21,6 +21,7 @@
#include "access/sysattr.h"
#include "catalog/pg_am.h"
#include "catalog/pg_amop.h"
+#include "catalog/pg_index_partitions.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_type.h"
@@ -246,6 +247,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
IndexClauseSet jclauseset;
IndexClauseSet eclauseset;
ListCell *lc;
+ bool ispartitioned = IS_PARTITIONED_REL(rel);
/* Skip the whole mess if no indexes */
if (rel->indexlist == NIL)
@@ -259,6 +261,22 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
{
IndexOptInfo *index = (IndexOptInfo *) lfirst(lc);
+ /*
+ * For partitioned relations, we can only consider global index scan
+ * paths. And for non partitioned relation ignore the indirect
+ * global indexes.
+ */
+ if ((ispartitioned && index->idxkind != INDEX_GLOBAL_DIRECT) ||
+ (!ispartitioned && index->idxkind != INDEX_LOCAL))
+ continue;
+
+ /*
+ * For non partitioned table we should not get the global index info.
+ * Check comments in get_relation_info() where we are adding
+ * IndexOptInfo nodes.
+ */
+ Assert(ispartitioned || index->idxkind != INDEX_GLOBAL_DIRECT);
+
/* Protect limited-size array in IndexClauseSets */
Assert(index->nkeycolumns <= INDEX_MAX_KEYS);
@@ -2228,6 +2246,7 @@ check_index_only(RelOptInfo *rel, IndexOptInfo *index)
{
bool result;
Bitmapset *attrs_used = NULL;
+ Bitmapset *rowidvar = NULL;
Bitmapset *index_canreturn_attrs = NULL;
ListCell *lc;
int i;
@@ -2248,6 +2267,21 @@ check_index_only(RelOptInfo *rel, IndexOptInfo *index)
*/
pull_varattnos((Node *) rel->reltarget->exprs, rel->relid, &attrs_used);
+ /*
+ * FIXME: Ugly hack to avoid global index-only scan during update/delete.
+ * In the normal case it is avoided because reltarget will have a junk
+ * attribute which would not match index_canreturn_attrs. But with a global
+ * index we are creating this scan on the parent table, so we would have an
+ * extra ROWID_VAR that is not caught by calling pull_varattnos with
+ * rel->relid; hence we search here specifically for ROWID_VAR.
+ */
+ if (rel->nparts != 0)
+ {
+ pull_varattnos((Node *) rel->reltarget->exprs, ROWID_VAR, &rowidvar);
+ if (rowidvar != NULL)
+ return false;
+ }
+
/*
* Add all the attributes used by restriction clauses; but consider only
* those clauses not implied by the index predicate, since ones that are
@@ -2276,9 +2310,11 @@ check_index_only(RelOptInfo *rel, IndexOptInfo *index)
/*
* For the moment, we just ignore index expressions. It might be nice
- * to do something with them, later.
+ * to do something with them, later. For global index we also add
+ * an internal partition id attribute so just ignore that as we don't
+ * need to return that attribute from index.
*/
- if (attno == 0)
+ if (attno == 0 || attno == PartitionIdAttributeNumber)
continue;
if (index->canreturn[i])
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index 5467e094ca..922b938f0b 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -20,6 +20,7 @@
*/
#include "postgres.h"
+#include "catalog/pg_inherits.h"
#include "optimizer/appendinfo.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
@@ -28,7 +29,8 @@
#include "optimizer/paths.h"
#include "optimizer/placeholder.h"
#include "optimizer/planmain.h"
-
+#include "storage/lmgr.h"
+#include "storage/lockdefs.h"
/*
* query_planner
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 549aedcfa9..b63e9c47c1 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -22,6 +22,7 @@
#include "access/parallel.h"
#include "access/sysattr.h"
#include "access/table.h"
+#include "catalog/partition.h"
#include "catalog/pg_aggregate.h"
#include "catalog/pg_inherits.h"
#include "catalog/pg_proc.h"
@@ -58,6 +59,7 @@
#include "parser/parsetree.h"
#include "partitioning/partdesc.h"
#include "rewrite/rewriteManip.h"
+#include "storage/lmgr.h"
#include "utils/backend_status.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
@@ -267,7 +269,7 @@ static bool group_by_has_partkey(RelOptInfo *input_rel,
static int common_prefix_cmp(const void *a, const void *b);
static List *generate_setop_child_grouplist(SetOperationStmt *op,
List *targetlist);
-
+static void lock_additional_rel(PlannerInfo *root);
/*****************************************************************************
*
@@ -581,6 +583,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
result->utilityStmt = parse->utilityStmt;
result->stmt_location = parse->stmt_location;
result->stmt_len = parse->stmt_len;
+ result->lockrelOids = glob->lockRelOids;
result->jitFlags = PGJIT_NONE;
if (jit_enabled && jit_above_cost >= 0 &&
@@ -1176,6 +1179,13 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root,
*/
SS_identify_outer_params(root);
+ /*
+ * Prepare a list of additional relation OIDs to be locked if there is any
+ * global index on the result relation, and lock those relations. For more
+ * details, refer to the header comment of lock_additional_rel().
+ */
+ lock_additional_rel(root);
+
/*
* If any initPlans were created in this query level, adjust the surviving
* Paths' costs and parallel-safety flags to account for them. The
@@ -7748,12 +7758,13 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
bool rel_is_partitioned = IS_PARTITIONED_REL(rel);
PathTarget *scanjoin_target;
ListCell *lc;
+ List *global_index_path_list = NIL;
/* This recurses, so be paranoid. */
check_stack_depth();
/*
- * If the rel is partitioned, we want to drop its existing paths and
+ * If the rel is partitioned, we want to drop its existing append paths and
* generate new ones. This function would still be correct if we kept the
* existing paths: we'd modify them to generate the correct target above
* the partitioning Append, and then they'd compete on cost with paths
@@ -7770,9 +7781,57 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
* stanza. Hence, zap the main pathlist here, then allow
* generate_useful_gather_paths to add path(s) to the main list, and
* finally zap the partial pathlist.
+ *
+ * Note: All the partitioned rel paths which are built by appending child
+ * rel paths will be rebuilt, so we need to preserve the global index
+ * paths which are created directly on the partitioned relation.
*/
if (rel_is_partitioned)
+ {
+ List *newtarget = NIL;
+ PathTarget *index_scanjoin_target;
+
+ /*
+ * Preprocess the scanjoin_targets and replace ROWID_VAR with the
+ * partitioned rel's varno, TODO - explain the reasoning here.
+ */
+ foreach(lc, scanjoin_targets)
+ {
+ PathTarget *target = lfirst_node(PathTarget, lc);
+
+ target = copy_pathtarget(target);
+ target->exprs = (List *)
+ adjust_appendrel_rowid_vars(root, (Node *) target->exprs,
+ rel->relid);
+ newtarget = lappend(newtarget, target);
+ }
+ /* Extract SRF-free scan/join target. */
+ index_scanjoin_target = linitial_node(PathTarget, newtarget);
+
+ /*
+ * As explained in the comments above, skip all paths other than the
+ * global index paths, as the other paths will be built again. So process
+ * the global index paths and apply the index_scanjoin_target to them.
+ */
+ foreach(lc, rel->pathlist)
+ {
+ Path *path = (Path *) lfirst(lc);
+ Path *newpath;
+
+ if (nodeTag(path) != T_IndexPath)
+ continue;
+
+ newpath = (Path *) create_projection_path(root, rel, path,
+ index_scanjoin_target);
+ global_index_path_list = lappend(global_index_path_list, newpath);
+ }
+
+ /*
+ * For now set rel->pathlist to NIL; once we have regenerated the
+ * append paths, add the saved global index paths back to the list.
+ */
rel->pathlist = NIL;
+ }
/*
* If the scan/join target is not parallel-safe, partial paths cannot
@@ -7935,6 +7994,9 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
/* Build new paths for this relation by appending child paths. */
add_paths_to_append_rel(root, rel, live_children);
+
+ if (global_index_path_list)
+ rel->pathlist = list_concat(rel->pathlist, global_index_path_list);
}
/*
@@ -8248,3 +8310,76 @@ generate_setop_child_grouplist(SetOperationStmt *op, List *targetlist)
return grouplist;
}
+
+
+/*
+ * lock_additional_rel
+ * Lock the extra relations needed in presence of a global index and
+ * record their OIDs in PlannerGlobal for copying into the PlannedStmt.
+ *
+ * During DML operations on tables with global indexes, it's necessary to
+ * lock the entire partition tree up to the partitioned relation that holds
+ * the global index.
+ */
+static void
+lock_additional_rel(PlannerInfo *root)
+{
+ Query *parse = root->parse;
+ RelOptInfo *rel;
+ ListCell *lc;
+ List *lockreloids = NIL;
+
+ /* Nothing to do if there is no result relation. */
+ if (parse->resultRelation <= 0)
+ return;
+
+ /*
+ * Fetch the RelOptInfo of the result relation. If we haven't built it
+ * already then do it now.
+ */
+ rel = find_base_rel_noerr(root, parse->resultRelation);
+ if (rel == NULL)
+ {
+ RangeTblEntry *rte = root->simple_rte_array[parse->resultRelation];
+
+ /*
+ * If we don't have global index on the result relation then we don't
+ * need to do anything.
+ */
+ if (!get_rel_has_globalindex(rte->relid))
+ return;
+
+ rel = build_simple_rel(root, parse->resultRelation, NULL);
+ }
+
+ /*
+ * Loop through all the indexes of the result relation and if it is a
+ * global index then lock all the inheritors under the relation on which
+ * this global index is created. Also store the list of all the OIDs
+ * in PlannerGlobal.
+ */
+ foreach(lc, rel->indexlist)
+ {
+ IndexOptInfo *index = (IndexOptInfo *) lfirst(lc);
+ List *childrel = NIL;
+
+ if (index->idxkind == INDEX_LOCAL)
+ continue;
+
+ if (list_member_oid(lockreloids, index->indrelid))
+ continue;
+
+ /*
+ * Acquire lock on top level parent on which the global index is
+ * created and also lock all its inheritors.
+ */
+ LockRelationOid(index->indrelid, RowExclusiveLock);
+ lockreloids = lappend_oid(lockreloids, index->indrelid);
+ childrel = find_all_inheritors(index->indrelid, RowExclusiveLock,
+ NULL);
+ lockreloids = list_concat(lockreloids, childrel);
+ }
+
+ root->glob->lockRelOids =
+ list_concat_unique_oid(root->glob->lockRelOids, lockreloids);
+}
diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c
index 5b3dc0d865..2ad52cb497 100644
--- a/src/backend/optimizer/util/appendinfo.c
+++ b/src/backend/optimizer/util/appendinfo.c
@@ -32,6 +32,7 @@ typedef struct
{
PlannerInfo *root;
int nappinfos;
+ int varno;
AppendRelInfo **appinfos;
} adjust_appendrel_attrs_context;
@@ -41,7 +42,8 @@ static void make_inh_translation_list(Relation oldrelation,
AppendRelInfo *appinfo);
static Node *adjust_appendrel_attrs_mutator(Node *node,
adjust_appendrel_attrs_context *context);
-
+static Node *adjust_appendrel_rowid_vars_mutator(Node *node,
+ adjust_appendrel_attrs_context *context);
/*
* make_append_rel_info
@@ -529,6 +531,62 @@ adjust_appendrel_attrs_mutator(Node *node,
return expression_tree_mutator(node, adjust_appendrel_attrs_mutator, context);
}
+/*
+ * Replace ROWID_VAR with the varno.
+ *
+ * This is similar to adjust_appendrel_attrs(), except that here, instead of
+ * preparing the scan target for the appendrel, we are preparing it for the
+ * partitioned rel; so the varno of the partitioned rel is passed as input and
+ * we need to replace the ROWID_VAR with the input varno.
+ */
+Node *
+adjust_appendrel_rowid_vars(PlannerInfo *root, Node *node, int varno)
+{
+ adjust_appendrel_attrs_context context;
+
+ context.root = root;
+ context.nappinfos = 0;
+ context.varno = varno;
+
+ /* Should never be translating a Query tree. */
+ Assert(node == NULL || !IsA(node, Query));
+
+ return adjust_appendrel_rowid_vars_mutator(node, &context);
+}
+
+static Node *
+adjust_appendrel_rowid_vars_mutator(Node *node,
+ adjust_appendrel_attrs_context *context)
+{
+ if (node == NULL)
+ return NULL;
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) copyObject(node);
+
+ if (var->varno == ROWID_VAR)
+ {
+ RowIdentityVarInfo *ridinfo = (RowIdentityVarInfo *)
+ list_nth(context->root->row_identity_vars, var->varattno - 1);
+
+ /* Substitute the Var given in the RowIdentityVarInfo */
+ var = copyObject(ridinfo->rowidvar);
+
+ /* Replace the ROWID_VAR with the varno of the partitioned rel. */
+ var->varno = context->varno;
+ /* identity vars shouldn't have nulling rels */
+ Assert(var->varnullingrels == NULL);
+ /* varnosyn in the RowIdentityVarInfo is probably wrong */
+ var->varnosyn = 0;
+ var->varattnosyn = 0;
+ }
+
+ return (Node *) var;
+ }
+ return expression_tree_mutator(node, adjust_appendrel_rowid_vars_mutator,
+ (void *) context);
+}
+
/*
* adjust_appendrel_attrs_multilevel
* Apply Var translations from an appendrel parent down to a child.
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index c716f9a6fe..576a7f97f4 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -35,6 +35,7 @@
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
+#include "nodes/pathnodes.h"
#include "nodes/supportnodes.h"
#include "optimizer/cost.h"
#include "optimizer/optimizer.h"
@@ -268,15 +269,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
continue;
}
- /*
- * TODO: Global index scan paths are not yet supported.
- */
- if (RelationIsGlobalIndex(indexRelation))
- {
- index_close(indexRelation, NoLock);
- continue;
- }
-
/*
* If the index is valid, but cannot yet be used, ignore it; but
* mark the plan we are generating as transient. See
@@ -293,7 +285,13 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info = makeNode(IndexOptInfo);
+ /* Set a flag to indicate this is a global index. */
+ if (RelationIsGlobalIndex(indexRelation))
+ info->idxkind = (index->indrelid == relationObjectId) ?
+ INDEX_GLOBAL_DIRECT : INDEX_GLOBAL_INDIRECT;
+
info->indexoid = index->indexrelid;
+ info->indrelid = index->indrelid;
info->reltablespace =
RelationGetForm(indexRelation)->reltablespace;
info->rel = rel;
@@ -333,15 +331,28 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->amoptionalkey = amroutine->amoptionalkey;
info->amsearcharray = amroutine->amsearcharray;
info->amsearchnulls = amroutine->amsearchnulls;
- info->amcanparallel = amroutine->amcanparallel;
info->amhasgettuple = (amroutine->amgettuple != NULL);
- info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
- relation->rd_tableam->scan_bitmap_next_tuple != NULL;
info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
amroutine->amrestrpos != NULL);
info->amcostestimate = amroutine->amcostestimate;
Assert(info->amcostestimate != NULL);
+ /*
+ * TODO: Currently parallel and bitmap scans are not supported
+ * for the global indexes.
+ */
+ if (info->idxkind != INDEX_LOCAL)
+ {
+ info->amcanparallel = false;
+ info->amhasgetbitmap = false;
+ }
+ else
+ {
+ info->amcanparallel = amroutine->amcanparallel;
+ info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+ relation->rd_tableam->scan_bitmap_next_tuple != NULL;
+ }
+
/* Fetch index opclass options */
info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
@@ -1932,7 +1943,13 @@ build_index_tlist(PlannerInfo *root, IndexOptInfo *index,
/* simple column */
const FormData_pg_attribute *att_tup;
- if (indexkey < 0)
+ /*
+ * If the attribute number is PartitionIdAttributeNumber then
+ * directly assign to the predefined partitionid_attr constant.
+ */
+ if (indexkey == PartitionIdAttributeNumber)
+ att_tup = &partitionid_attr;
+ else if (indexkey < 0)
att_tup = SystemAttributeDefinition(indexkey);
else
att_tup = TupleDescAttr(heapRelation->rd_att, indexkey - 1);
diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c
index 8065237a18..3fd7bc949f 100644
--- a/src/backend/optimizer/util/var.c
+++ b/src/backend/optimizer/util/var.c
@@ -21,6 +21,7 @@
#include "postgres.h"
#include "access/sysattr.h"
+#include "catalog/pg_index_partitions.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index d354f44e66..1dc7fd2ae4 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -4266,6 +4266,7 @@ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd)
RelationGetRelationName(parentRel))));
break;
case RELKIND_INDEX:
+ case RELKIND_GLOBAL_INDEX:
/* the index must be partitioned */
ereport(ERROR,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index ce6a626eba..7d3082a54b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6500,6 +6500,8 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
/* Ignore non-ordering indexes */
if (index->sortopfamily == NULL)
continue;
+ if (index->idxkind != INDEX_LOCAL)
+ continue;
/*
* Ignore partial indexes --- we only want stats that cover the entire
@@ -6720,6 +6722,8 @@ get_actual_variable_endpoint(Relation heapRel,
InitNonVacuumableSnapshot(SnapshotNonVacuumable,
GlobalVisTestFor(heapRel));
+ Assert(!RelationIsGlobalIndex(indexRel));
+
index_scan = index_beginscan(heapRel, indexRel,
&SnapshotNonVacuumable, NULL,
1, 0);
diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c
index 89a1c79e98..412628872c 100644
--- a/src/backend/utils/cache/plancache.c
+++ b/src/backend/utils/cache/plancache.c
@@ -1928,6 +1928,21 @@ AcquireExecutorLocks(List *stmt_list, bool acquire)
else
UnlockRelationOid(rte->relid, rte->rellockmode);
}
+
+ /*
+ * Loop through the lockrelOids derived from the result relations
+ * and acquire a lock on each of those relations. We could store the
+ * lockmode along with the OID, but we can directly use RowExclusiveLock
+ * because these are derived from result relations and result relations
+ * are locked in this mode.
+ */
+ foreach_oid(relid, plannedstmt->lockrelOids)
+ {
+ if (acquire)
+ LockRelationOid(relid, RowExclusiveLock);
+ else
+ UnlockRelationOid(relid, RowExclusiveLock);
+ }
}
}
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c
index 778ec2815c..8624ece5d7 100644
--- a/src/bin/psql/describe.c
+++ b/src/bin/psql/describe.c
@@ -1923,7 +1923,8 @@ describeOneTableDetails(const char *schemaname,
attgenerated_col = cols++;
}
if (tableinfo.relkind == RELKIND_INDEX ||
- tableinfo.relkind == RELKIND_PARTITIONED_INDEX)
+ tableinfo.relkind == RELKIND_PARTITIONED_INDEX ||
+ tableinfo.relkind == RELKIND_GLOBAL_INDEX)
{
if (pset.sversion >= 110000)
{
@@ -2308,7 +2309,8 @@ describeOneTableDetails(const char *schemaname,
}
if (tableinfo.relkind == RELKIND_INDEX ||
- tableinfo.relkind == RELKIND_PARTITIONED_INDEX)
+ tableinfo.relkind == RELKIND_PARTITIONED_INDEX ||
+ tableinfo.relkind == RELKIND_GLOBAL_INDEX)
{
/* Footer information about an index */
PGresult *result;
@@ -2412,7 +2414,8 @@ describeOneTableDetails(const char *schemaname,
/*
* If it's a partitioned index, we'll print the tablespace below
*/
- if (tableinfo.relkind == RELKIND_INDEX)
+ if (tableinfo.relkind == RELKIND_INDEX ||
+ tableinfo.relkind == RELKIND_GLOBAL_INDEX)
add_tablespace_footer(&cont, tableinfo.relkind,
tableinfo.tablespace, true);
}
@@ -3666,6 +3669,7 @@ add_tablespace_footer(printTableContent *const cont, char relkind,
relkind == RELKIND_INDEX ||
relkind == RELKIND_PARTITIONED_TABLE ||
relkind == RELKIND_PARTITIONED_INDEX ||
+ relkind == RELKIND_GLOBAL_INDEX ||
relkind == RELKIND_TOASTVALUE)
{
/*
@@ -4055,6 +4059,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys
" WHEN " CppAsString2(RELKIND_FOREIGN_TABLE) " THEN '%s'"
" WHEN " CppAsString2(RELKIND_PARTITIONED_TABLE) " THEN '%s'"
" WHEN " CppAsString2(RELKIND_PARTITIONED_INDEX) " THEN '%s'"
+ " WHEN " CppAsString2(RELKIND_GLOBAL_INDEX) " THEN '%s'"
" END as \"%s\",\n"
" pg_catalog.pg_get_userbyid(c.relowner) as \"%s\"",
gettext_noop("Schema"),
@@ -4068,6 +4073,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys
gettext_noop("foreign table"),
gettext_noop("partitioned table"),
gettext_noop("partitioned index"),
+ gettext_noop("global index"),
gettext_noop("Type"),
gettext_noop("Owner"));
cols_so_far = 4;
@@ -4148,7 +4154,8 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys
appendPQExpBufferStr(&buf, CppAsString2(RELKIND_MATVIEW) ",");
if (showIndexes)
appendPQExpBufferStr(&buf, CppAsString2(RELKIND_INDEX) ","
- CppAsString2(RELKIND_PARTITIONED_INDEX) ",");
+ CppAsString2(RELKIND_PARTITIONED_INDEX) ","
+ CppAsString2(RELKIND_GLOBAL_INDEX) ",");
if (showSeq)
appendPQExpBufferStr(&buf, CppAsString2(RELKIND_SEQUENCE) ",");
if (showSystem || pattern)
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 5b2ab181b5..ec032ceda6 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -15,6 +15,8 @@
#define GENAM_H
#include "access/htup.h"
+#include "access/itup.h"
+#include "access/relscan.h"
#include "access/sdir.h"
#include "access/skey.h"
#include "nodes/tidbitmap.h"
@@ -265,6 +267,10 @@ extern SysScanDesc systable_beginscan_ordered(Relation heapRelation,
extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan,
ScanDirection direction);
extern void systable_endscan_ordered(SysScanDesc sysscan);
+extern Relation globalindex_partition_rel_lookup(GlobalIndexPartitionCache pdir, Oid relid);
+extern void globalindex_partition_cache_destroy(GlobalIndexPartitionCache pdir);
+extern GlobalIndexPartitionCache create_globalindex_partition_cache(MemoryContext mcxt);
+extern void global_indexscan_setup_partrel(IndexScanDesc scan);
extern void systable_inplace_update_begin(Relation relation,
Oid indexId,
bool indexOK,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index cf7ddb0131..435a74749a 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1009,6 +1009,9 @@ typedef BTVacuumPostingData *BTVacuumPosting;
typedef struct BTScanPosItem /* what we remember about each match */
{
+ Oid heapOid; /* Oid of the partition relation , only valid for
+ global indexes because global index can hold tuples
+ from multiple partitions */
ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */
LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b5e0fb386c..8d0925d504 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -125,6 +125,8 @@ typedef struct IndexFetchTableData
struct IndexScanInstrumentation;
+typedef struct GlobalIndexPartitionCacheData *GlobalIndexPartitionCache;
+
/*
* We use the same IndexScanDescData structure for both amgettuple-based
* and amgetbitmap-based index scans. Some fields are only relevant in
@@ -168,7 +170,9 @@ typedef struct IndexScanDescData
struct TupleDescData *xs_itupdesc; /* rowtype descriptor of xs_itup */
HeapTuple xs_hitup; /* index data returned by AM, as HeapTuple */
struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */
-
+ Oid xs_heapoid; /* Oid of the partition relation , only valid
+ for global indexes because global index can
+ hold tuples from multiple partitions */
ItemPointerData xs_heaptid; /* result */
bool xs_heap_continue; /* T if must keep walking, potential
* further results */
@@ -189,6 +193,8 @@ typedef struct IndexScanDescData
/* parallel index scan information, in shared memory */
struct ParallelIndexScanDescData *parallel_scan;
+ bool xs_global_index;
+ GlobalIndexPartitionCache xs_global_index_cache;
} IndexScanDescData;
/* Generic structure for parallel scans */
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 6567759595..fbae020a4c 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -153,6 +153,9 @@ typedef struct PlannerGlobal
/* type OIDs for PARAM_EXEC Params */
List *paramExecTypes;
+ /* additional relation OIDs to be locked for global index */
+ List *lockRelOids;
+
/* highest PlaceHolderVar ID assigned */
Index lastPHId;
@@ -856,6 +859,13 @@ typedef enum RelOptKind
RELOPT_OTHER_UPPER_REL,
} RelOptKind;
+typedef enum IndexKind
+{
+ INDEX_LOCAL,
+ INDEX_GLOBAL_DIRECT,
+ INDEX_GLOBAL_INDIRECT
+} IndexKind;
+
/*
* Is the given relation a simple relation i.e a base or "other" member
* relation?
@@ -1143,6 +1153,14 @@ struct IndexOptInfo
Oid indexoid;
/* tablespace of index (not table) */
Oid reltablespace;
+
+ /*
+ * OID of the relation on which the index is created. For a normal index we
+ * have the RelOptInfo reference to identify that relation, but for a global
+ * index we need to store it explicitly, as a global index might be defined
+ * on some upper-level parent relation.
+ */
+ Oid indrelid;
/* back-link to index's table; don't print, else infinite recursion */
RelOptInfo *rel pg_node_attr(read_write_ignore);
@@ -1206,6 +1224,9 @@ struct IndexOptInfo
*/
List *indrestrictinfo;
+ /* whether the index is local or direct global or indirect global */
+ IndexKind idxkind;
+
/* true if index predicate matches query */
bool predOK;
/* true if a unique index */
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 4f59e30d62..c07a8f14fc 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -122,6 +122,9 @@ typedef struct PlannedStmt
/* OIDs of relations the plan depends on */
List *relationOids;
+ /* OIDs of additional relations to be locked (for global indexes) */
+ List *lockrelOids;
+
/* other dependencies, as PlanInvalItems */
List *invalItems;
diff --git a/src/include/optimizer/appendinfo.h b/src/include/optimizer/appendinfo.h
index d06f93b726..f8fd66c657 100644
--- a/src/include/optimizer/appendinfo.h
+++ b/src/include/optimizer/appendinfo.h
@@ -22,6 +22,8 @@ extern AppendRelInfo *make_append_rel_info(Relation parentrel,
Index parentRTindex, Index childRTindex);
extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node,
int nappinfos, AppendRelInfo **appinfos);
+extern Node *adjust_appendrel_rowid_vars(PlannerInfo *root, Node *node,
+ int varno);
extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node,
RelOptInfo *childrel,
RelOptInfo *parentrel);
--
2.49.0