0001-adds-ability-to-dump-data-for-tables-in-multiple-chu.patch

application/x-patch

Filename: 0001-adds-ability-to-dump-data-for-tables-in-multiple-chu.patch
Type: application/x-patch
Part: 0
Message: Patch: dumping tables data in multiple chunks in pg_dump
From 015cc46de277971d97c3b1823a5777fccb56c270 Mon Sep 17 00:00:00 2001
From: Hannu Krosing <hannuk@google.com>
Date: Tue, 11 Nov 2025 16:11:08 +0100
Subject: [PATCH] adds ability to dump data for tables in multiple chunks
 controlled by flag --huge-table-chunk-pages

---
 src/bin/pg_dump/pg_backup.h          |   1 +
 src/bin/pg_dump/pg_backup_archiver.c |   1 +
 src/bin/pg_dump/pg_dump.c            | 157 +++++++++++++++++++++------
 src/bin/pg_dump/pg_dump.h            |   7 ++
 4 files changed, 130 insertions(+), 36 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..b71caed8b83 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -178,6 +178,7 @@ typedef struct _dumpOptions
 	bool		aclsSkip;
 	const char *lockWaitTimeout;
 	int			dump_inserts;	/* 0 = COPY, otherwise rows per INSERT */
+	int			huge_table_chunk_pages; /* chunk when relpages is above this */
 
 	/* flags for various command-line long options */
 	int			disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 59eaecb4ed7..d555e365ea5 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -154,6 +154,7 @@ InitDumpOptions(DumpOptions *opts)
 	opts->dumpSchema = true;
 	opts->dumpData = true;
 	opts->dumpStatistics = false;
+	opts->huge_table_chunk_pages = UINT32_MAX; /* disable chunking by default */
 }
 
 /*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index a00918bacb4..e9ccc8e43ed 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -538,6 +538,7 @@ main(int argc, char **argv)
 		{"exclude-extension", required_argument, NULL, 17},
 		{"sequence-data", no_argument, &dopt.sequence_data, 1},
 		{"restrict-key", required_argument, NULL, 25},
+		{"huge-table-chunk-pages", required_argument, NULL, 26},
 
 		{NULL, 0, NULL, 0}
 	};
@@ -802,6 +803,13 @@ main(int argc, char **argv)
 				dopt.restrict_key = pg_strdup(optarg);
 				break;
 
+			case 26:			/* huge table chunk pages */
+				if (!option_parse_int(optarg, "--huge-table-chunk-pages", 1, INT32_MAX,
+									  &dopt.huge_table_chunk_pages))
+					exit_nicely(1);
+				pg_log_warning("CHUNKING: set dopt.huge_table_chunk_pages to [%u]",(BlockNumber) dopt.huge_table_chunk_pages);
+				break;
+
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1357,6 +1365,9 @@ help(const char *progname)
 	printf(_("  --extra-float-digits=NUM     override default setting for extra_float_digits\n"));
 	printf(_("  --filter=FILENAME            include or exclude objects and data from dump\n"
 			 "                               based on expressions in FILENAME\n"));
+	printf(_("  --huge-table-chunk-pages=NUMPAGES\n"
+		     "                               Number of main table pages above which data is \n"
+			 "                               copied out in chunks, also determines the chunk size\n"));
 	printf(_("  --if-exists                  use IF EXISTS when dropping objects\n"));
 	printf(_("  --include-foreign-data=PATTERN\n"
 			 "                               include data of foreign tables on foreign\n"
@@ -2397,7 +2408,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 	 * a filter condition was specified.  For other cases a simple COPY
 	 * suffices.
 	 */
-	if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+	if (tdinfo->filtercond || tdinfo->chunking || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
 	{
 		/* Temporary allows to access to foreign tables to dump data */
 		if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2413,9 +2424,18 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 		else
 			appendPQExpBufferStr(q, "* ");
 
-		appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+		appendPQExpBuffer(q, "FROM %s %s",
 						  fmtQualifiedDumpable(tbinfo),
 						  tdinfo->filtercond ? tdinfo->filtercond : "");
+		if (tdinfo->chunking)
+		{
+			appendPQExpBuffer(q, "%s ctid BETWEEN '(%u,1)' AND '(%u,32000)'", /* there is no (*,0) tuple */
+								 tdinfo->filtercond?" AND ":" WHERE ",
+								 tdinfo->startPage, tdinfo->endPage);
+			pg_log_warning("CHUNKING: pages [%u:%u]",tdinfo->startPage, tdinfo->endPage);
+		}
+		
+		appendPQExpBuffer(q, ") TO stdout;");
 	}
 	else
 	{
@@ -2423,6 +2443,9 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 						  fmtQualifiedDumpable(tbinfo),
 						  column_list);
 	}
+
+	pg_log_warning("CHUNKING: data query: %s", q->data);
+	
 	res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
 	PQclear(res);
 	destroyPQExpBuffer(clistBuf);
@@ -2918,42 +2941,101 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
 	{
 		TocEntry   *te;
 
-		te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
-						  ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
-									   .namespace = tbinfo->dobj.namespace->dobj.name,
-									   .owner = tbinfo->rolname,
-									   .description = "TABLE DATA",
-									   .section = SECTION_DATA,
-									   .createStmt = tdDefn,
-									   .copyStmt = copyStmt,
-									   .deps = &(tbinfo->dobj.dumpId),
-									   .nDeps = 1,
-									   .dumpFn = dumpFn,
-									   .dumpArg = tdinfo));
-
-		/*
-		 * Set the TocEntry's dataLength in case we are doing a parallel dump
-		 * and want to order dump jobs by table size.  We choose to measure
-		 * dataLength in table pages (including TOAST pages) during dump, so
-		 * no scaling is needed.
-		 *
-		 * However, relpages is declared as "integer" in pg_class, and hence
-		 * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
-		 * Cast so that we get the right interpretation of table sizes
-		 * exceeding INT_MAX pages.
+		/* Chunking works off relpages, which may be slightly off but is
+		 * the best we have without doing our own page count.  It should be
+		 * good enough for the typical use case of huge tables, which should
+		 * have their relpages kept up to date by autovacuum.
+		 *
+		 * We should likely add a slight hysteresis here to avoid tiny
+		 * chunks when relpages is close to the threshold.
 		 */
-		te->dataLength = (BlockNumber) tbinfo->relpages;
-		te->dataLength += (BlockNumber) tbinfo->toastpages;
+		if ((BlockNumber) tbinfo->relpages < dopt->huge_table_chunk_pages) /* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+		{
+			pg_log_warning("CHUNKING: toc for simple relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+			te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+							ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+										.namespace = tbinfo->dobj.namespace->dobj.name,
+										.owner = tbinfo->rolname,
+										.description = "TABLE DATA",
+										.section = SECTION_DATA,
+										.createStmt = tdDefn,
+										.copyStmt = copyStmt,
+										.deps = &(tbinfo->dobj.dumpId),
+										.nDeps = 1,
+										.dumpFn = dumpFn,
+										.dumpArg = tdinfo));
 
-		/*
-		 * If pgoff_t is only 32 bits wide, the above refinement is useless,
-		 * and instead we'd better worry about integer overflow.  Clamp to
-		 * INT_MAX if the correct result exceeds that.
-		 */
-		if (sizeof(te->dataLength) == 4 &&
-			(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
-			 te->dataLength < 0))
-			te->dataLength = INT_MAX;
+			/*
+			* Set the TocEntry's dataLength in case we are doing a parallel dump
+			* and want to order dump jobs by table size.  We choose to measure
+			* dataLength in table pages (including TOAST pages) during dump, so
+			* no scaling is needed.
+			*
+			* However, relpages is declared as "integer" in pg_class, and hence
+			* also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+			* Cast so that we get the right interpretation of table sizes
+			* exceeding INT_MAX pages.
+			*/
+			te->dataLength = (BlockNumber) tbinfo->relpages;
+			te->dataLength += (BlockNumber) tbinfo->toastpages;
+
+			/*
+			* If pgoff_t is only 32 bits wide, the above refinement is useless,
+			* and instead we'd better worry about integer overflow.  Clamp to
+			* INT_MAX if the correct result exceeds that.
+			*/
+			if (sizeof(te->dataLength) == 4 &&
+				(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
+				te->dataLength < 0))
+				te->dataLength = INT_MAX;
+		}
+		else
+		{
+			BlockNumber current_chunk_start = 0;
+			PQExpBuffer chunk_desc = createPQExpBuffer();
+			
+			pg_log_warning("CHUNKING: toc for chunked relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+			while (current_chunk_start < (BlockNumber) tbinfo->relpages)/* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+			{
+				TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+				memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+				AssignDumpId(&chunk_tdinfo->dobj);
+				//addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId); /* do we need this here */
+				chunk_tdinfo->chunking = true;
+				chunk_tdinfo->startPage = current_chunk_start;
+				chunk_tdinfo->endPage = current_chunk_start + dopt->huge_table_chunk_pages - 1;
+
+				pg_log_warning("CHUNKING: toc for pages [%u:%u]",chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+				
+				current_chunk_start += dopt->huge_table_chunk_pages;
+				if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+					chunk_tdinfo->endPage = UINT32_MAX; /* last chunk is for "all the rest" */
+
+				printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+				te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+							ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+										.namespace = tbinfo->dobj.namespace->dobj.name,
+										.owner = tbinfo->rolname,
+										.description = chunk_desc->data,
+										.section = SECTION_DATA,
+										.createStmt = tdDefn,
+										.copyStmt = copyStmt,
+										.deps = &(tbinfo->dobj.dumpId),
+										.nDeps = 1,
+										.dumpFn = dumpFn,
+										.dumpArg = chunk_tdinfo));
+
+				te->dataLength = dopt->huge_table_chunk_pages;
+				/* let's assume toast pages distribute evenly among chunks */
+				te->dataLength += (off_t)dopt->huge_table_chunk_pages * tbinfo->toastpages / tbinfo->relpages;
+			}
+
+			destroyPQExpBuffer(chunk_desc);
+		}
 	}
 
 	destroyPQExpBuffer(copyBuf);
@@ -3077,6 +3159,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
 	tdinfo->dobj.namespace = tbinfo->dobj.namespace;
 	tdinfo->tdtable = tbinfo;
 	tdinfo->filtercond = NULL;	/* might get set later */
+	tdinfo->chunking = false; /* defaults */
+	tdinfo->startPage = 0;
+	tdinfo->endPage = UINT32_MAX;
 	addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
 
 	/* A TableDataInfo contains data, of course */
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 72a00e1bc20..30e8160ea66 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
 
 #include "pg_backup.h"
 #include "catalog/pg_publication_d.h"
+#include "storage/block.h"
 
 
 #define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ?  1 : 0) )
@@ -413,6 +414,12 @@ typedef struct _tableDataInfo
 	DumpableObject dobj;
 	TableInfo  *tdtable;		/* link to table to dump */
 	char	   *filtercond;		/* WHERE condition to limit rows dumped */
+	bool 		chunking;
+	BlockNumber	startPage;		/* starting table page */
+	BlockNumber	endPage;		/* last table page (inclusive) of a page-range
+								 * dump: startPage + huge_table_chunk_pages - 1,
+								 * or UINT32_MAX for the final chunk; hysteresis
+								 * may be added to avoid tiny trailing chunks */
 } TableDataInfo;
 
 typedef struct _indxInfo
-- 
2.51.2.1041.gc1ab5b90ca-goog