v1-0001-Catalog-for-globalindexid-reloid-to-PartitionId-t.patch

application/octet-stream

Filename: v1-0001-Catalog-for-globalindexid-reloid-to-PartitionId-t.patch
Type: application/octet-stream
Part: 0
Message: Re: Proposal: Global Index for PostgreSQL

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v1-0001
Subject: Catalog for (globalindexid, reloid) to PartitionId to mapping
File+
src/backend/catalog/Makefile 2 2
src/backend/catalog/pg_index_partitions.c 342 0
src/include/catalog/Makefile 2 1
src/include/catalog/pg_index_partitions.h 84 0
src/include/c.h 16 0
src/include/postgres.h 19 0
src/include/utils/rel.h 7 0
src/test/regress/expected/oidjoins.out 2 0
From 0c48cf2daad6fba92623c4b9b7be60a60e1e3fab Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilip.kumar@enterprisedb.com>
Date: Tue, 23 Jul 2024 15:46:38 +0530
Subject: [PATCH v1 1/4] Catalog for (globalindexid, reloid) to PartitionId to
 mapping

This is a base patch required for global indexes.  Basically, global
indexes stores tuple from multiple partitions of a partitioned relation
so TIDs alone are insufficient for uniquely identifying tuples. We might
consider storing the relation OID along with TID, as the combination
of relation OID and TID is the most straightforward way to uniquely
identify a heap tuple.

However, this approach has its drawbacks. For instance, if a partition
is detached, the global index would not know how to ignore the tuples
from that detached partition unless it cleans out all tuples from that
partition in the global index.

Therefore, we need an identifier for each partition that is only valid
while the partition is attached and becomes invalid once the partition
is detached. We call this identifier a partition ID. However, when
accessing tuples from the index, we still need to convert the partition
ID to the heap OID at some point.

One might assume a 1-to-1 mapping between partition IDs and relation
OIDs, but this isn't feasible. In a multi-level partition hierarchy,
detaching a partitioned table from a higher-level partitioned table
invalidates the underlying leaf relation's partition ID for the global
indexes of the parent from which it was detached. However, that partition
ID should remain valid for the global indexes prersent at the lower level
partitioned tables where the leaf partition is still attached. Therefore,
for simplicity, we maintain a partition ID for each global index and leaf
relation OID pair. This way, detaching a partition only requires invalidating
the specific global index and partition ID combinations from which the leaf
partition is being detached.

This patch provides a catalog for storing these mappings and a cache for
faster access. It also includes mechanisms for allocating partition IDs
and managing insertions and deletions in the catalog.
---
 src/backend/catalog/Makefile              |   4 +-
 src/backend/catalog/pg_index_partitions.c | 342 ++++++++++++++++++++++
 src/include/c.h                           |  16 +
 src/include/catalog/Makefile              |   3 +-
 src/include/catalog/pg_index_partitions.h |  84 ++++++
 src/include/postgres.h                    |  19 ++
 src/include/utils/rel.h                   |   7 +
 src/test/regress/expected/oidjoins.out    |   2 +
 8 files changed, 474 insertions(+), 3 deletions(-)
 create mode 100644 src/backend/catalog/pg_index_partitions.c
 create mode 100644 src/include/catalog/pg_index_partitions.h

diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index c090094ed0..275910eda6 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -46,8 +46,8 @@ OBJS = \
 	pg_subscription.o \
 	pg_type.o \
 	storage.o \
-	toasting.o
-
+	toasting.o \
+	pg_index_partitions.o
 include $(top_srcdir)/src/backend/common.mk
 
 .PHONY: install-data
diff --git a/src/backend/catalog/pg_index_partitions.c b/src/backend/catalog/pg_index_partitions.c
new file mode 100644
index 0000000000..e637feb453
--- /dev/null
+++ b/src/backend/catalog/pg_index_partitions.c
@@ -0,0 +1,342 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_index_partitions.c
+ *
+ *	  routines to support manipulation of the pg_index_partitions relation and
+ *	  also provide cache over this relation for faster mapping from partition
+ *	  id to reloid for a global index.
+ *
+ * Notes(why we need pg_index_partitions relation):
+ *
+ * This mapping is required for global indexes.  Basically, global indexes
+ * stores tuple from multiple partitions of a partitioned relation so TIDs
+ * alone are insufficient for uniquely identifying tuples. We might consider
+ * storing the relation OID along with TID, as the combination of relation OID
+ * and TID is the most straightforward way to uniquely identify a heap tuple.
+ *
+ * However, this approach has its drawbacks. For instance, if a partition is
+ * detached, the global index would not know how to ignore the tuples from that
+ * detached partition unless it cleans out all tuples from that partition in
+ * the global index.
+ *
+ * Therefore, we need an identifier for each partition that is only valid while
+ * the partition is attached and becomes invalid once the partition is
+ * detached. We call this identifier a partition ID. However, when accessing
+ * tuples from the index, we still need to convert the partition ID to the heap
+ * OID at some point.  One might assume a 1-to-1 mapping between partition IDs
+ * and relation OIDs, but this isn't feasible. In a multi-level partition
+ * hierarchy, detaching a partitioned table from a higher-level partitioned
+ * table invalidates the underlying leaf relation's partition ID for the global
+ * indexes of the parent from which it was detached. However, that partition
+ * ID should remain valid for the global indexes prersent at the lower level
+ * partitioned tables where the leaf partition is still attached. Therefore,
+ * for simplicity, we maintain a partition ID for each global index and leaf
+ * relation OID pair. This way, detaching a partition only requires
+ * invalidating the specific global index and partition ID combinations from
+ * which the leaf partition is being detached.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/catalog/pg_index_partitions.c
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/stratnum.h"
+#include "access/table.h"
+#include "catalog/indexing.h"
+#include "catalog/catalog.h"
+#include "catalog/pg_index_partitions.h"
+#include "partitioning/partdesc.h"
+#include "utils/fmgroids.h"
+#include "utils/inval.h"
+#include "utils/rel.h"
+
+/*
+ * InsertIndexPartitionEntry - Insert parition id to reloid mapping
+ *
+ * Insert (indexoid, partid) to reloid mapping into pg_index_partitions table.
+ */
+void
+InsertIndexPartitionEntry(Relation irel, Oid reloid, PartitionId partid)
+{
+	Datum		values[Natts_pg_index_partitions];
+	bool		nulls[Natts_pg_index_partitions];
+	HeapTuple	tuple;
+	Relation	rel;
+	Oid			indexoid = RelationGetRelid(irel);
+
+	rel = table_open(IndexPartitionsRelationId, RowExclusiveLock);
+
+	/* Make the pg_index_partitions entry. */
+	values[Anum_pg_index_partitions_indexoid - 1] = ObjectIdGetDatum(indexoid);
+	values[Anum_pg_index_partitions_reloid - 1] = ObjectIdGetDatum(reloid);
+	values[Anum_pg_index_partitions_partid - 1] = PartitionIdGetDatum(partid);
+
+	memset(nulls, 0, sizeof(nulls));
+
+	tuple = heap_form_tuple(RelationGetDescr(rel), values, nulls);
+
+	CatalogTupleInsert(rel, tuple);
+
+	heap_freetuple(tuple);
+
+	table_close(rel, RowExclusiveLock);
+}
+
+/*
+ * DeleteIndexPartitionEntries - Delete all index partition entries.
+ *
+ * This will delete all the entires for given global index id from
+ * pg_index_partitions table.  This should only be called when global index
+ * is being dropped.
+ */
+void
+DeleteIndexPartitionEntries(Oid indrelid)
+{
+	Relation	catalogRelation;
+	ScanKeyData key;
+	SysScanDesc scan;
+	HeapTuple	tuple;
+
+	/* Find pg_index_partitions entries by indrelid. */
+	catalogRelation = table_open(IndexPartitionsRelationId, RowExclusiveLock);
+	ScanKeyInit(&key,
+				Anum_pg_index_partitions_indexoid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(indrelid));
+
+	scan = systable_beginscan(catalogRelation, IndexPartitionsIndexId, true,
+							  NULL, 1, &key);
+	while (HeapTupleIsValid(tuple = systable_getnext(scan)))
+		CatalogTupleDelete(catalogRelation, &tuple->t_self);
+
+	/* Done */
+	systable_endscan(scan);
+	table_close(catalogRelation, RowExclusiveLock);
+}
+
+/*
+ * BuildIndexPartitionInfo - Cache for parittion id to reloid mapping
+ *
+ * Build a cache for faster access to the mappping from partition id to the
+ * relation oid.  For more detail on this mapping refer to the comments in
+ * pg_index_partition.h and also atop PartitionId declaration in c.h.
+ */
+void
+BuildIndexPartitionInfo(Relation relation, MemoryContext context)
+{
+	SysScanDesc scan;
+	ScanKeyData key;
+	HeapTuple	tuple;
+	Relation	rel;
+	PartitionId	maxpartid = InvalidPartitionId;
+	IndexPartitionInfo	map;
+	MemoryContext oldcontext;
+	HASHCTL		ctl;
+
+	/*
+	 * Open pg_index_partition table for getting the partition id to reloid
+	 * mapping for the input index relation.
+	 */
+	rel = table_open(IndexPartitionsRelationId, AccessShareLock);
+
+	oldcontext = MemoryContextSwitchTo(context);
+	map = (IndexPartitionInfoData *) palloc0(sizeof(IndexPartitionInfoData));
+	map->context = context;
+
+	/* Make a new hash table for the cache */
+	ctl.keysize = sizeof(Oid);
+	ctl.entrysize = sizeof(IndexPartitionInfoEntry);
+	ctl.hcxt = context;
+
+	map->pdir_hash = hash_create("index partition directory", 256, &ctl,
+								  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+	MemoryContextSwitchTo(oldcontext);
+
+	ScanKeyInit(&key,
+				Anum_pg_index_partitions_indexoid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(RelationGetRelid(relation)));
+
+	scan = systable_beginscan(rel, IndexPartitionsIndexId, true,
+							  NULL, 1, &key);
+
+	while ((tuple = systable_getnext(scan)) != NULL)
+	{
+		Form_pg_index_partitions form = (Form_pg_index_partitions) GETSTRUCT(tuple);
+		IndexPartitionInfoEntry *entry;
+		bool		found;
+
+		/*
+		 * We need to consider the partition id of the detached partitioned as
+		 * well while computing the maxpartid so that we do not repeat the
+		 * value.
+		 */
+		if (form->partid > maxpartid)
+			maxpartid = form->partid;
+
+		if (!OidIsValid(form->reloid))
+			continue;
+
+		entry = hash_search(map->pdir_hash, &form->partid, HASH_ENTER, &found);
+		Assert(!found);
+		entry->reloid = form->reloid;
+	}
+
+	map->max_partid = maxpartid;
+	relation->rd_indexpartinfo = map;
+	systable_endscan(scan);
+
+	table_close(rel, AccessShareLock);
+}
+
+/*
+ * IndexGetRelationPartitionId - Get partition id for the reloid
+ *
+ * Get the partition ID for the given partition relation OID for the specified
+ * global index relation.
+ */
+PartitionId
+IndexGetRelationPartitionId(Relation irel, Oid reloid)
+{
+	IndexPartitionInfo	map;
+	HASH_SEQ_STATUS		hash_seq;
+	PartitionId			partid = InvalidPartitionId;
+	IndexPartitionInfoEntry *entry;
+
+	if (irel->rd_indexpartinfo == NULL)
+		BuildIndexPartitionInfo(irel, CurrentMemoryContext);
+
+	map = irel->rd_indexpartinfo;
+
+	hash_seq_init(&hash_seq, map->pdir_hash);
+
+	while ((entry = hash_seq_search(&hash_seq)) != NULL)
+	{
+		if (entry->reloid == reloid)
+		{
+			partid = entry->partid;
+			hash_seq_term(&hash_seq);
+			break;
+		}
+	}
+
+	return partid;
+}
+
+/*
+ * IndexGetPartitionReloid - Get relation oid for the paritionid
+ *
+ * Get the relation OID for the given partition ID for the specified global
+ * index relation.
+ */
+Oid
+IndexGetPartitionReloid(Relation irel, PartitionId partid)
+{
+	IndexPartitionInfo	map = irel->rd_indexpartinfo;
+	IndexPartitionInfoEntry *entry;
+	bool		found;
+
+	entry = hash_search(map->pdir_hash, &partid, HASH_FIND, &found);
+	if (!found)
+		return InvalidOid;
+
+	return entry->reloid;
+}
+
+/*
+ * InvalidateIndexPartitionEntries - Invalidate pg_index_partitions entries
+ *
+ * Set reloid as Invalid in pg_index_partitions entries with respect to the
+ * given reloid.  If a valid global indexoids list is given then only
+ * invalidate the reloid entires which are related to the input global index
+ * oids.
+ */
+void
+InvalidateIndexPartitionEntries(List *reloids, Oid indexoid)
+{
+	Relation	catalogRelation;
+	SysScanDesc scan;
+	ScanKeyData key;
+	HeapTuple	tuple;
+
+	/*
+	 * Find pg_inherits entries by inhparent.  (We need to scan them all in
+	 * order to verify that no other partition is pending detach.)
+	 */
+	catalogRelation = table_open(IndexPartitionsRelationId, RowExclusiveLock);
+
+	ScanKeyInit(&key,
+				Anum_pg_index_partitions_indexoid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(indexoid));
+
+	scan = systable_beginscan(catalogRelation, IndexPartitionsIndexId, true,
+							  NULL, 1, &key);
+
+	while ((tuple = systable_getnext(scan)) != NULL)
+	{
+		Form_pg_index_partitions form = (Form_pg_index_partitions) GETSTRUCT(tuple);
+		HeapTuple	newtup;
+
+		if (!list_member_oid(reloids, form->reloid))
+			continue;
+
+		newtup = heap_copytuple(tuple);
+		((Form_pg_index_partitions) GETSTRUCT(newtup))->reloid = InvalidOid;
+
+		CatalogTupleUpdate(catalogRelation,
+						   &tuple->t_self,
+						   newtup);
+		heap_freetuple(newtup);
+	}
+
+	/* Done */
+	systable_endscan(scan);
+	table_close(catalogRelation, RowExclusiveLock);
+}
+
+/*
+ * IndexGetNextPartitionID - Get the next partition ID of the global index
+ *
+ * Obtain the next partition ID to be allocated for the specified global index
+ * relation. Also update this value in the cache for the next allocation.
+ */
+PartitionId
+IndexGetNextPartitionID(Relation irel)
+{
+	PartitionId partid;
+
+	/*
+	 * If the cache is not already build then do it first so that we know what
+	 * is the maximum partition ID value and then we can generate the next
+	 * value.
+	 */
+	if (irel->rd_indexpartinfo == NULL)
+		BuildIndexPartitionInfo(irel, CurrentMemoryContext);
+
+	/* Use the max_partid + 1 value as the next parition id. */
+	partid = irel->rd_indexpartinfo->max_partid + 1;
+
+	/*
+	 * If partitionID is wraparound then give error.
+	 * XXX here we might consider reusing the unused partition IDs.
+	 */
+	if (!PartIdIsValid(partid))
+		elog(ERROR, "could not allocate new PartitionID because limit is exhausted");
+
+	/*
+	 * Store the new value in the cache, in case the cache is invalidated we
+	 * will get the max value again from the system catalog, so there should
+	 * not be any issue.
+	 */
+	irel->rd_indexpartinfo->max_partid = partid;
+
+	return partid;
+}
diff --git a/src/include/c.h b/src/include/c.h
index 8cdc16a0f4..797891abb6 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -636,6 +636,22 @@ typedef uint32 MultiXactOffset;
 
 typedef uint32 CommandId;
 
+/*
+ * This is a new type used for global indexes. Global indexes store data
+ * from multiple partitions, so along with the heap TID, we also need a new
+ * identifier to identify the heap. We could store the reloid as well, but
+ * if the partition is detached, it becomes very hard to identify that we
+ * don't need to access data from this particular relation. It becomes even
+ * more problematic if the partition is dropped and later reloid is reused by
+ * another relation, making it difficult to distinguish whether a particular
+ * index  tuple belongs to the old relation or the new relation. To handle
+ * this, we use a new identifier called partition id. Whenever a relation is
+ * detached  from the global index, the partition id is invalidated, allowing
+ * us to easily identify that we don't need to access the tuple of this
+ * partition id.
+ */
+typedef	uint32 PartitionId;
+
 #define FirstCommandId	((CommandId) 0)
 #define InvalidCommandId	(~(CommandId)0)
 
diff --git a/src/include/catalog/Makefile b/src/include/catalog/Makefile
index 2bbc7805fe..2b76dcdc16 100644
--- a/src/include/catalog/Makefile
+++ b/src/include/catalog/Makefile
@@ -81,7 +81,8 @@ CATALOG_HEADERS := \
 	pg_publication_namespace.h \
 	pg_publication_rel.h \
 	pg_subscription.h \
-	pg_subscription_rel.h
+	pg_subscription_rel.h \
+	pg_index_partitions.h
 
 GENERATED_HEADERS := $(CATALOG_HEADERS:%.h=%_d.h)
 
diff --git a/src/include/catalog/pg_index_partitions.h b/src/include/catalog/pg_index_partitions.h
new file mode 100644
index 0000000000..2dcc8ca3fc
--- /dev/null
+++ b/src/include/catalog/pg_index_partitions.h
@@ -0,0 +1,84 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_index_partitions.h
+ *	  definition of the system catalog (pg_index_partitions)
+ *
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/catalog/pg_index_partitions.h
+ *
+ * NOTES
+ *	  The Catalog.pm module reads this file and derives schema
+ *	  information.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_INDEX_PARTITIONS_H
+#define PG_INDEX_PARTITIONS_H
+
+#include "catalog/genbki.h"
+#include "catalog/pg_attribute.h"
+#include "catalog/pg_index_partitions_d.h"
+#include "catalog/pg_type_d.h"
+#include "utils/hsearch.h"
+#include "utils/relcache.h"
+
+/* ----------------
+ *		pg_index_partitions definition.  cpp turns this into
+ *		typedef struct FormData_pg_index_partitions
+ * ----------------
+ */
+CATALOG(pg_index_partitions,6015,IndexPartitionsRelationId)
+{
+	Oid			indexoid BKI_LOOKUP(pg_class);
+	Oid			reloid BKI_LOOKUP(pg_class);
+	int32		partid;
+} FormData_pg_index_partitions;
+
+/* ----------------
+ *		Form_pg_index_partitions corresponds to a pointer to a tuple with
+ *		the format of pg_index_partitions relation.
+ * ----------------
+ */
+typedef FormData_pg_index_partitions *Form_pg_index_partitions;
+
+DECLARE_UNIQUE_INDEX_PKEY(pg_index_partitions_indexoid_partid_index, 6018, IndexPartitionsIndexId, pg_index_partitions, btree(indexoid oid_ops, partid int4_ops));
+
+/*
+ * Map over the pg_index_partitions table for a particular global index.  This
+ * will be used for faster lookup of the next partid to be used for this global
+ * index and also for finding out the partition relation for a give partid of
+ * a global index.
+ */
+typedef struct IndexPartitionInfoData
+{
+	MemoryContext	context;	/* memory context for storing the cache data */
+	PartitionId		max_partid;	/* max value of used partid */
+	HTAB		   *pdir_hash;	/* partid to reloid lookup hash */
+} IndexPartitionInfoData;
+
+typedef IndexPartitionInfoData *IndexPartitionInfo;
+
+/*
+ * TODO we might think of storing the RelationDesc along with reloid?
+ */
+typedef struct IndexPartitionInfoEntry
+{
+	PartitionId	partid;		/* key */
+	Oid			reloid;		/* payload */
+} IndexPartitionInfoEntry;
+
+#define		InvalidPartitionId			0
+#define		FirstValidPartitionId		1
+#define		PartIdIsValid(partid)	((bool) ((partid) != InvalidPartitionId))
+
+extern void BuildIndexPartitionInfo(Relation relation, MemoryContext context);
+extern PartitionId IndexGetRelationPartitionId(Relation irel, Oid reloid);
+extern Oid IndexGetPartitionReloid(Relation irel, PartitionId partid);
+extern PartitionId IndexGetNextPartitionID(Relation irel);
+extern void DeleteIndexPartitionEntries(Oid indrelid);
+extern void InsertIndexPartitionEntry(Relation irel, Oid reloid, PartitionId partid);
+extern void InvalidateIndexPartitionEntries(List *reloids, Oid indexoid);
+#endif							/* PG_INDEX_PARTITIONS_H */
diff --git a/src/include/postgres.h b/src/include/postgres.h
index 8a41a66868..a0a24e2bca 100644
--- a/src/include/postgres.h
+++ b/src/include/postgres.h
@@ -536,6 +536,25 @@ Float8GetDatum(float8 X)
 extern Datum Float8GetDatum(float8 X);
 #endif
 
+/*
+ * DatumGetPartitionId
+ *		Returns partition identifier value of a datum.
+ */
+static inline PartitionId
+DatumGetPartitionId(Datum X)
+{
+	return (PartitionId) X;
+}
+
+/*
+ * PartitionIdGetDatum
+ *		Returns datum representation for a partition identifier.
+ */
+static inline Datum
+PartitionIdGetDatum(PartitionId X)
+{
+	return (Datum) X;
+}
 
 /*
  * Int64GetDatumFast
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index b552359915..35270fdc05 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -19,6 +19,7 @@
 #include "catalog/catalog.h"
 #include "catalog/pg_class.h"
 #include "catalog/pg_index.h"
+#include "catalog/pg_index_partitions.h"
 #include "catalog/pg_publication.h"
 #include "nodes/bitmapset.h"
 #include "partitioning/partdefs.h"
@@ -217,6 +218,12 @@ typedef struct RelationData
 	Oid		   *rd_indcollation;	/* OIDs of index collations */
 	bytea	  **rd_opcoptions;	/* parsed opclass-specific options */
 
+	/*
+	 * Cache for mapping reloid to partition id for the global index.  For more
+	 * details refer comments in pg_index_partitions.h.
+	 */
+	IndexPartitionInfo	rd_indexpartinfo;
+
 	/*
 	 * rd_amcache is available for index and table AMs to cache private data
 	 * about the relation.  This must be just a cache since it may get reset
diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out
index 215eb899be..7588e23d6a 100644
--- a/src/test/regress/expected/oidjoins.out
+++ b/src/test/regress/expected/oidjoins.out
@@ -266,3 +266,5 @@ NOTICE:  checking pg_subscription {subdbid} => pg_database {oid}
 NOTICE:  checking pg_subscription {subowner} => pg_authid {oid}
 NOTICE:  checking pg_subscription_rel {srsubid} => pg_subscription {oid}
 NOTICE:  checking pg_subscription_rel {srrelid} => pg_class {oid}
+NOTICE:  checking pg_index_partitions {indexoid} => pg_class {oid}
+NOTICE:  checking pg_index_partitions {reloid} => pg_class {oid}
-- 
2.49.0