0001-adds-ability-to-dump-data-for-tables-in-multiple-chu.patch
application/x-patch
Filename: 0001-adds-ability-to-dump-data-for-tables-in-multiple-chu.patch
Type: application/x-patch
Part: 0
From 015cc46de277971d97c3b1823a5777fccb56c270 Mon Sep 17 00:00:00 2001
From: Hannu Krosing <hannuk@google.com>
Date: Tue, 11 Nov 2025 16:11:08 +0100
Subject: [PATCH] adds ability to dump data for tables in multiple chunks
controlled by flag --huge-table-chunk-pages
---
src/bin/pg_dump/pg_backup.h | 1 +
src/bin/pg_dump/pg_backup_archiver.c | 1 +
src/bin/pg_dump/pg_dump.c | 157 +++++++++++++++++++++------
src/bin/pg_dump/pg_dump.h | 7 ++
4 files changed, 130 insertions(+), 36 deletions(-)
diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..b71caed8b83 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -178,6 +178,7 @@ typedef struct _dumpOptions
bool aclsSkip;
const char *lockWaitTimeout;
int dump_inserts; /* 0 = COPY, otherwise rows per INSERT */
+ int huge_table_chunk_pages; /* chunk when relpages is above this */
/* flags for various command-line long options */
int disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 59eaecb4ed7..d555e365ea5 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -154,6 +154,7 @@ InitDumpOptions(DumpOptions *opts)
opts->dumpSchema = true;
opts->dumpData = true;
opts->dumpStatistics = false;
+ opts->huge_table_chunk_pages = UINT32_MAX; /* disable chunking by default */
}
/*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index a00918bacb4..e9ccc8e43ed 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -538,6 +538,7 @@ main(int argc, char **argv)
{"exclude-extension", required_argument, NULL, 17},
{"sequence-data", no_argument, &dopt.sequence_data, 1},
{"restrict-key", required_argument, NULL, 25},
+ {"huge-table-chunk-pages", required_argument, NULL, 26},
{NULL, 0, NULL, 0}
};
@@ -802,6 +803,13 @@ main(int argc, char **argv)
dopt.restrict_key = pg_strdup(optarg);
break;
+ case 26: /* huge table chunk pages */
+ if (!option_parse_int(optarg, "--huge-table-chunk-pages", 1, INT32_MAX,
+ &dopt.huge_table_chunk_pages))
+ exit_nicely(1);
+ pg_log_warning("CHUNKING: set dopt.huge_table_chunk_pages to [%u]",(BlockNumber) dopt.huge_table_chunk_pages);
+ break;
+
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1357,6 +1365,9 @@ help(const char *progname)
printf(_(" --extra-float-digits=NUM override default setting for extra_float_digits\n"));
printf(_(" --filter=FILENAME include or exclude objects and data from dump\n"
" based on expressions in FILENAME\n"));
+ printf(_(" --huge-table-chunk-pages=NUMPAGES\n"
+ " Number of main table pages above which data is \n"
+ " copied out in chunks, also determines the chunk size\n"));
printf(_(" --if-exists use IF EXISTS when dropping objects\n"));
printf(_(" --include-foreign-data=PATTERN\n"
" include data of foreign tables on foreign\n"
@@ -2397,7 +2408,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
* a filter condition was specified. For other cases a simple COPY
* suffices.
*/
- if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+ if (tdinfo->filtercond || tdinfo->chunking || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
{
/* Temporary allows to access to foreign tables to dump data */
if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2413,9 +2424,18 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
else
appendPQExpBufferStr(q, "* ");
- appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+ appendPQExpBuffer(q, "FROM %s %s",
fmtQualifiedDumpable(tbinfo),
tdinfo->filtercond ? tdinfo->filtercond : "");
+ if (tdinfo->chunking)
+ {
+ appendPQExpBuffer(q, "%s ctid BETWEEN '(%u,1)' AND '(%u,32000)'", /* there is no (*,0) tuple */
+ tdinfo->filtercond?" AND ":" WHERE ",
+ tdinfo->startPage, tdinfo->endPage);
+ pg_log_warning("CHUNKING: pages [%u:%u]",tdinfo->startPage, tdinfo->endPage);
+ }
+
+ appendPQExpBuffer(q, ") TO stdout;");
}
else
{
@@ -2423,6 +2443,9 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
fmtQualifiedDumpable(tbinfo),
column_list);
}
+
+ pg_log_warning("CHUNKING: data query: %s", q->data);
+
res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
PQclear(res);
destroyPQExpBuffer(clistBuf);
@@ -2918,42 +2941,101 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
{
TocEntry *te;
- te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
- ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
- .namespace = tbinfo->dobj.namespace->dobj.name,
- .owner = tbinfo->rolname,
- .description = "TABLE DATA",
- .section = SECTION_DATA,
- .createStmt = tdDefn,
- .copyStmt = copyStmt,
- .deps = &(tbinfo->dobj.dumpId),
- .nDeps = 1,
- .dumpFn = dumpFn,
- .dumpArg = tdinfo));
-
- /*
- * Set the TocEntry's dataLength in case we are doing a parallel dump
- * and want to order dump jobs by table size. We choose to measure
- * dataLength in table pages (including TOAST pages) during dump, so
- * no scaling is needed.
- *
- * However, relpages is declared as "integer" in pg_class, and hence
- * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
- * Cast so that we get the right interpretation of table sizes
- * exceeding INT_MAX pages.
+ /* Chunking works off relpages, which may be slightly off
+ * but is the best we have without doing our own page count.
+ * That should be enough for the typical use case of huge tables,
+ * which should have their relpages updated by autovacuum.
+ *
+ * We should likely have a slight hysteresis here to avoid
+ * tiny chunks when relpages is close to the threshold.
*/
- te->dataLength = (BlockNumber) tbinfo->relpages;
- te->dataLength += (BlockNumber) tbinfo->toastpages;
+ if ((BlockNumber) tbinfo->relpages < dopt->huge_table_chunk_pages) /* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+ {
+ pg_log_warning("CHUNKING: toc for simple relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+ te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+ ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+ .namespace = tbinfo->dobj.namespace->dobj.name,
+ .owner = tbinfo->rolname,
+ .description = "TABLE DATA",
+ .section = SECTION_DATA,
+ .createStmt = tdDefn,
+ .copyStmt = copyStmt,
+ .deps = &(tbinfo->dobj.dumpId),
+ .nDeps = 1,
+ .dumpFn = dumpFn,
+ .dumpArg = tdinfo));
- /*
- * If pgoff_t is only 32 bits wide, the above refinement is useless,
- * and instead we'd better worry about integer overflow. Clamp to
- * INT_MAX if the correct result exceeds that.
- */
- if (sizeof(te->dataLength) == 4 &&
- (tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
- te->dataLength < 0))
- te->dataLength = INT_MAX;
+ /*
+ * Set the TocEntry's dataLength in case we are doing a parallel dump
+ * and want to order dump jobs by table size. We choose to measure
+ * dataLength in table pages (including TOAST pages) during dump, so
+ * no scaling is needed.
+ *
+ * However, relpages is declared as "integer" in pg_class, and hence
+ * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+ * Cast so that we get the right interpretation of table sizes
+ * exceeding INT_MAX pages.
+ */
+ te->dataLength = (BlockNumber) tbinfo->relpages;
+ te->dataLength += (BlockNumber) tbinfo->toastpages;
+
+ /*
+ * If pgoff_t is only 32 bits wide, the above refinement is useless,
+ * and instead we'd better worry about integer overflow. Clamp to
+ * INT_MAX if the correct result exceeds that.
+ */
+ if (sizeof(te->dataLength) == 4 &&
+ (tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
+ te->dataLength < 0))
+ te->dataLength = INT_MAX;
+ }
+ else
+ {
+ BlockNumber current_chunk_start = 0;
+ PQExpBuffer chunk_desc = createPQExpBuffer();
+
+ pg_log_warning("CHUNKING: toc for chunked relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+ while (current_chunk_start < (BlockNumber) tbinfo->relpages)/* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+ {
+ TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+ memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+ AssignDumpId(&chunk_tdinfo->dobj);
+ //addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId); /* do we need this here */
+ chunk_tdinfo->chunking = true;
+ chunk_tdinfo->startPage = current_chunk_start;
+ chunk_tdinfo->endPage = current_chunk_start + dopt->huge_table_chunk_pages - 1;
+
+ pg_log_warning("CHUNKING: toc for pages [%u:%u]",chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+ current_chunk_start += dopt->huge_table_chunk_pages;
+ if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+ chunk_tdinfo->endPage = UINT32_MAX; /* last chunk is for "all the rest" */
+
+ printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+ te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+ ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+ .namespace = tbinfo->dobj.namespace->dobj.name,
+ .owner = tbinfo->rolname,
+ .description = chunk_desc->data,
+ .section = SECTION_DATA,
+ .createStmt = tdDefn,
+ .copyStmt = copyStmt,
+ .deps = &(tbinfo->dobj.dumpId),
+ .nDeps = 1,
+ .dumpFn = dumpFn,
+ .dumpArg = chunk_tdinfo));
+
+ te->dataLength = dopt->huge_table_chunk_pages;
+ /* let's assume toast pages distribute evenly among chunks */
+ te->dataLength += (off_t)dopt->huge_table_chunk_pages * tbinfo->toastpages / tbinfo->relpages;
+ }
+
+ destroyPQExpBuffer(chunk_desc);
+ }
}
destroyPQExpBuffer(copyBuf);
@@ -3077,6 +3159,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
tdinfo->dobj.namespace = tbinfo->dobj.namespace;
tdinfo->tdtable = tbinfo;
tdinfo->filtercond = NULL; /* might get set later */
+ tdinfo->chunking = false; /* defaults */
+ tdinfo->startPage = 0;
+ tdinfo->endPage = UINT32_MAX;
addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
/* A TableDataInfo contains data, of course */
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 72a00e1bc20..30e8160ea66 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
#include "pg_backup.h"
#include "catalog/pg_publication_d.h"
+#include "storage/block.h"
#define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ? 1 : 0) )
@@ -413,6 +414,12 @@ typedef struct _tableDataInfo
DumpableObject dobj;
TableInfo *tdtable; /* link to table to dump */
char *filtercond; /* WHERE condition to limit rows dumped */
+ bool chunking;
+ BlockNumber startPage; /* starting table page */
+ BlockNumber endPage; /* ending table page for page-range dump,
+ * usually startPage + huge_table_chunk_pages - 1 (inclusive),
+ * but we may want some small hysteresis to avoid single-page chunks
+ */
} TableDataInfo;
typedef struct _indxInfo
--
2.51.2.1041.gc1ab5b90ca-goog