v2-0002-Fall-back-to-foreign-insert-when-tuple-is-too-big.patch

text/x-diff
Filename: v2-0002-Fall-back-to-foreign-insert-when-tuple-is-too-big.patch
Type: text/x-diff
Part: 1
Message: Re: Limit memory usage by postgres_fdw batches
From f9a2ec48d907c37259fc6658a6a0ea5925143a42 Mon Sep 17 00:00:00 2001
From: Alexander Pyhalov <a.pyhalov@postgrespro.ru>
Date: Tue, 30 Dec 2025 10:35:22 +0300
Subject: [PATCH 2/3] Fall back to foreign insert when tuple is too big

When tuple is more then work_mem, we don't form batch,
but fall through to usual foreign insert. This means
that executor can form batch, insert it and finish with
a usual insert, so there will be no pending batch inserts
to process. To avoid manipulation with possibly long lists,
we don't clear es_insert_pending_result_relations and
es_insert_pending_modifytables lists, but check in
ExecPendingInserts() if there are any pending tuples.
---
 .../postgres_fdw/expected/postgres_fdw.out    |  11 ++
 contrib/postgres_fdw/sql/postgres_fdw.sql     |   9 ++
 src/backend/executor/nodeModifyTable.c        | 138 ++++++++++--------
 src/include/nodes/execnodes.h                 |   1 +
 4 files changed, 96 insertions(+), 63 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 48e3185b227..a8bfec9413c 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -7640,6 +7640,17 @@ select count(*) from tab_batch_sharded;
     45
 (1 row)
 
+delete from tab_batch_sharded;
+-- test batch insert with large tuples and switching from batch to usual insert
+set work_mem to 64;
+insert into tab_batch_sharded select 3, case when i%4 = 0 then lpad('a',65*1024,'a') else 'test'|| i end from generate_series(1, 100) i;
+select count(*) from tab_batch_sharded;
+ count 
+-------
+   100
+(1 row)
+
+reset work_mem;
 drop table tab_batch_local;
 drop table tab_batch_sharded;
 drop table tab_batch_sharded_p1_remote;
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 9a8f9e28135..467943ca32e 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -1936,10 +1936,19 @@ create foreign table tab_batch_sharded_p1 partition of tab_batch_sharded
   server loopback options (table_name 'tab_batch_sharded_p1_remote');
 insert into tab_batch_sharded select * from tab_batch_local;
 select count(*) from tab_batch_sharded;
+delete from tab_batch_sharded;
+-- test batch insert with large tuples and switching from batch to usual insert
+set work_mem to 64;
+insert into tab_batch_sharded select 3, case when i%4 = 0 then lpad('a',65*1024,'a') else 'test'|| i end from generate_series(1, 100) i;
+select count(*) from tab_batch_sharded;
+reset work_mem;
+
 drop table tab_batch_local;
 drop table tab_batch_sharded;
 drop table tab_batch_sharded_p1_remote;
 
+
+
 alter server loopback options (drop batch_size);
 
 -- ===================================================================
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index a28beca63ff..2e6dcdff7bf 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -942,7 +942,6 @@ ExecInsert(ModifyTableContext *context,
 		 */
 		if (resultRelInfo->ri_BatchSize > 1)
 		{
-			bool		flushed = false;
 			Size		tuple_len;
 
 			/* Compute length of the current tuple */
@@ -961,76 +960,82 @@ ExecInsert(ModifyTableContext *context,
 								resultRelInfo->ri_PlanSlots,
 								resultRelInfo->ri_NumSlots,
 								estate, canSetTag);
-				flushed = true;
 			}
 
-			oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
-
-			if (resultRelInfo->ri_Slots == NULL)
+			if (tuple_len < work_mem * 1024)
 			{
-				resultRelInfo->ri_Slots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
-				resultRelInfo->ri_PlanSlots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
-			}
+				oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
 
-			/*
-			 * Initialize the batch slots. We don't know how many slots will
-			 * be needed, so we initialize them as the batch grows, and we
-			 * keep them across batches. To mitigate an inefficiency in how
-			 * resource owner handles objects with many references (as with
-			 * many slots all referencing the same tuple descriptor) we copy
-			 * the appropriate tuple descriptor for each slot.
-			 */
-			if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized)
-			{
-				TupleDesc	tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
-				TupleDesc	plan_tdesc =
-					CreateTupleDescCopy(planSlot->tts_tupleDescriptor);
+				if (resultRelInfo->ri_Slots == NULL)
+				{
+					resultRelInfo->ri_Slots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
+					resultRelInfo->ri_PlanSlots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
+				}
 
-				resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
-					MakeSingleTupleTableSlot(tdesc, slot->tts_ops);
+				/*
+				 * Initialize the batch slots. We don't know how many slots
+				 * will be needed, so we initialize them as the batch grows,
+				 * and we keep them across batches. To mitigate an
+				 * inefficiency in how resource owner handles objects with
+				 * many references (as with many slots all referencing the
+				 * same tuple descriptor) we copy the appropriate tuple
+				 * descriptor for each slot.
+				 */
+				if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized)
+				{
+					TupleDesc	tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
+					TupleDesc	plan_tdesc =
+						CreateTupleDescCopy(planSlot->tts_tupleDescriptor);
 
-				resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] =
-					MakeSingleTupleTableSlot(plan_tdesc, planSlot->tts_ops);
+					resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
+						MakeSingleTupleTableSlot(tdesc, slot->tts_ops);
 
-				/* remember how many batch slots we initialized */
-				resultRelInfo->ri_NumSlotsInitialized++;
-			}
+					resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] =
+						MakeSingleTupleTableSlot(plan_tdesc, planSlot->tts_ops);
 
-			ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots],
-						 slot);
+					/* remember how many batch slots we initialized */
+					resultRelInfo->ri_NumSlotsInitialized++;
+				}
 
-			ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots],
-						 planSlot);
+				ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots],
+							 slot);
 
-			/*
-			 * If these are the first tuples stored in the buffers, add the
-			 * target rel and the mtstate to the
-			 * es_insert_pending_result_relations and
-			 * es_insert_pending_modifytables lists respectively, except in
-			 * the case where flushing was done above, in which case they
-			 * would already have been added to the lists, so no need to do
-			 * this.
-			 */
-			if (resultRelInfo->ri_NumSlots == 0 && !flushed)
-			{
-				Assert(!list_member_ptr(estate->es_insert_pending_result_relations,
-										resultRelInfo));
-				estate->es_insert_pending_result_relations =
-					lappend(estate->es_insert_pending_result_relations,
-							resultRelInfo);
-				estate->es_insert_pending_modifytables =
-					lappend(estate->es_insert_pending_modifytables, mtstate);
-			}
-			Assert(list_member_ptr(estate->es_insert_pending_result_relations,
-								   resultRelInfo));
+				ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots],
+							 planSlot);
 
-			resultRelInfo->ri_NumSlots++;
-			/* Batch size can grow up to tuple length even if it exceeds work_mem. */
-			resultRelInfo->ri_BatchMemoryUsed += tuple_len;
+				/*
+				 * If these are the first tuples stored in the buffers, add
+				 * the target rel and the mtstate to the
+				 * es_insert_pending_result_relations and
+				 * es_insert_pending_modifytables lists respectively, except
+				 * in the case where flushing was done above, in which case
+				 * they would already have been added to the lists, so no need
+				 * to do this.
+				 */
+				if (resultRelInfo->ri_NumSlots == 0 && !resultRelInfo->ri_ExecutorPendingStateModified)
+				{
+					Assert(!list_member_ptr(estate->es_insert_pending_result_relations,
+											resultRelInfo));
+					estate->es_insert_pending_result_relations =
+						lappend(estate->es_insert_pending_result_relations,
+								resultRelInfo);
+					estate->es_insert_pending_modifytables =
+						lappend(estate->es_insert_pending_modifytables, mtstate);
+					resultRelInfo->ri_ExecutorPendingStateModified = true;
+				}
+				Assert(list_member_ptr(estate->es_insert_pending_result_relations,
+									   resultRelInfo));
 
-			MemoryContextSwitchTo(oldContext);
+				resultRelInfo->ri_NumSlots++;
+				resultRelInfo->ri_BatchMemoryUsed += tuple_len;
+				/* We've flushed batch if it is too big. */
+				Assert(resultRelInfo->ri_BatchMemoryUsed < work_mem * 1024);
 
-			return NULL;
+				MemoryContextSwitchTo(oldContext);
+
+				return NULL;
+			}
+			/* else do usual foreign insert */
 		}
 
 		/*
@@ -1445,11 +1450,18 @@ ExecPendingInserts(EState *estate)
 		ModifyTableState *mtstate = (ModifyTableState *) lfirst(l2);
 
 		Assert(mtstate);
-		ExecBatchInsert(mtstate, resultRelInfo,
-						resultRelInfo->ri_Slots,
-						resultRelInfo->ri_PlanSlots,
-						resultRelInfo->ri_NumSlots,
-						estate, mtstate->canSetTag);
+
+		/*
+		 * Batch insert could switch to non-batched insert, in this case we
+		 * could have no filled slots.
+		 */
+		if (resultRelInfo->ri_NumSlots > 0)
+			ExecBatchInsert(mtstate, resultRelInfo,
+							resultRelInfo->ri_Slots,
+							resultRelInfo->ri_PlanSlots,
+							resultRelInfo->ri_NumSlots,
+							estate, mtstate->canSetTag);
+		resultRelInfo->ri_ExecutorPendingStateModified = false;
 	}
 
 	list_free(estate->es_insert_pending_result_relations);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b82ec938228..dcc23774a4c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -543,6 +543,7 @@ typedef struct ResultRelInfo
 	int			ri_NumSlotsInitialized; /* number of initialized slots */
 	int			ri_BatchSize;	/* max slots inserted in a single batch */
 	int			ri_BatchMemoryUsed;	/* memory used by batch */
+	bool			ri_ExecutorPendingStateModified; /* true if member of estate->es_insert_pending_result_relations */
 	TupleTableSlot **ri_Slots;	/* input tuples for batch insert */
 	TupleTableSlot **ri_PlanSlots;
 
-- 
2.43.0