From 740c153a712939d6c65f7d31592b75512544f383 Mon Sep 17 00:00:00 2001
From: Jeevan Chalke <jeevan.chalke@enterprisedb.com>
Date: Fri, 16 Aug 2019 14:10:33 +0530
Subject: [PATCH 3/4] Add support for the incremental backup.

If 90% or more of a file's blocks are modified, we send the whole file;
otherwise we send only the modified blocks. Such a file is named .partial
and has the following header details:

 - magic number, INCREMENTAL_BACKUP_MAGIC (4 bytes)
 - checksum, a CRC-32C of the header (4 bytes)
 - number of blocks in this .partial file (4 bytes)
 - all modified block numbers (4 bytes each)
 - modified blocks
---
 doc/src/sgml/protocol.sgml             |  50 ++++++-
 doc/src/sgml/ref/pg_basebackup.sgml    |  21 +++
 src/backend/access/transam/xlog.c      |   5 +-
 src/backend/access/transam/xlogfuncs.c |   6 +-
 src/backend/replication/basebackup.c   | 244 +++++++++++++++++++++++++++++++--
 src/backend/storage/file/fd.c          |  29 ++++
 src/include/access/xlog.h              |   3 +-
 src/include/replication/basebackup.h   |  13 ++
 src/include/storage/fd.h               |   1 +
 9 files changed, 354 insertions(+), 18 deletions(-)

diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index b20f169..fb07585 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -2466,7 +2466,7 @@ The commands accepted in replication mode are:
   </varlistentry>
 
   <varlistentry>
-    <term><literal>BASE_BACKUP</literal> [ <literal>LABEL</literal> <replaceable>'label'</replaceable> ] [ <literal>PROGRESS</literal> ] [ <literal>FAST</literal> ] [ <literal>WAL</literal> ] [ <literal>NOWAIT</literal> ] [ <literal>MAX_RATE</literal> <replaceable>rate</replaceable> ] [ <literal>TABLESPACE_MAP</literal> ] [ <literal>NOVERIFY_CHECKSUMS</literal> ]
+    <term><literal>BASE_BACKUP</literal> [ <literal>LABEL</literal> <replaceable>'label'</replaceable> ] [ <literal>PROGRESS</literal> ] [ <literal>FAST</literal> ] [ <literal>WAL</literal> ] [ <literal>NOWAIT</literal> ] [ <literal>MAX_RATE</literal> <replaceable>rate</replaceable> ] [ <literal>TABLESPACE_MAP</literal> ] [ <literal>NOVERIFY_CHECKSUMS</literal> ] [ <literal>LSN</literal> <replaceable>'lsn'</replaceable> ]
      <indexterm><primary>BASE_BACKUP</primary></indexterm>
     </term>
     <listitem>
@@ -2576,6 +2576,22 @@ The commands accepted in replication mode are:
          </para>
         </listitem>
        </varlistentry>
+
+       <varlistentry>
+        <term><literal>LSN</literal> <replaceable>'lsn'</replaceable></term>
+        <listitem>
+         <para>
+          Includes in the backup only those data blocks whose LSN is greater
+          than or equal to the given LSN. However, if 90% or more of the data
+          blocks in a file are modified, the entire file is sent. Otherwise, a
+          <filename>.partial</filename> file containing only the modified
+          blocks is created and sent instead. The <filename>.partial</filename>
+          file has its own header followed by the actual data blocks. Note that
+          only relation files are considered here; all other files are sent as
+          is.
+         </para>
+        </listitem>
+       </varlistentry>
       </variablelist>
      </para>
      <para>
@@ -2698,6 +2714,38 @@ The commands accepted in replication mode are:
       Owner, group, and file mode are set if the underlying file system on
       the server supports it.
      </para>
+     <para>
+      An incremental backup's <filename>.partial</filename> file has the
+      following format:
+      <itemizedlist>
+       <listitem>
+        <para>
+         Starts with a 4-byte magic number
+        </para>
+       </listitem>
+       <listitem>
+        <para>
+         Followed by a 4-byte CRC of the header (containing a magic number,
+         count of the number of blocks, and all block numbers)
+        </para>
+       </listitem>
+       <listitem>
+        <para>
+         Then a 4-byte count of the number of blocks included in the file
+        </para>
+       </listitem>
+       <listitem>
+        <para>
+         Then the block numbers, each as a 4-byte quantity
+        </para>
+       </listitem>
+       <listitem>
+        <para>
+         Followed by the actual data blocks, in the same order as the block numbers
+        </para>
+       </listitem>
+      </itemizedlist>
+     </para>
     </listitem>
   </varlistentry>
 </variablelist>
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index fc9e222..00782e0 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -408,6 +408,19 @@ PostgreSQL documentation
      </varlistentry>
 
      <varlistentry>
+      <term><option>--lsn=<replaceable class="parameter">LSN</replaceable></option></term>
+      <listitem>
+       <para>
+        Takes an incremental backup, using LSN as a threshold. Only blocks
+        whose LSN is greater than or equal to the given LSN are backed up. A
+        file containing only such blocks has <filename>.partial</filename> as
+        an extension. A backup taken in this manner has to be combined with
+        the full backup using the <command>pg_combinebackup</command> utility.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-n</option></term>
       <term><option>--no-clean</option></term>
       <listitem>
@@ -792,6 +805,14 @@ PostgreSQL documentation
 <prompt>$</prompt> <userinput>pg_basebackup -D backup/data -T /opt/ts=$(pwd)/backup/ts</userinput>
 </screen>
   </para>
+
+  <para>
+   To create an incremental backup of the blocks whose LSN is greater than
+   or equal to <literal>5/19000060</literal>:
+<screen>
+<prompt>$</prompt> <userinput>pg_basebackup -D incbackup --lsn='5/19000060'</userinput>
+</screen>
+  </para>
  </refsect1>
 
  <refsect1>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f553523..e427c0f 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -10178,7 +10178,7 @@ XLogRecPtr
 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 				   StringInfo labelfile, List **tablespaces,
 				   StringInfo tblspcmapfile, bool infotbssize,
-				   bool needtblspcmapfile)
+				   bool needtblspcmapfile, XLogRecPtr ref_lsn)
 {
 	bool		exclusive = (labelfile == NULL);
 	bool		backup_started_in_recovery = false;
@@ -10506,6 +10506,9 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 		appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
 		appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
 		appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
+		if (!XLogRecPtrIsInvalid(ref_lsn))
+			appendStringInfo(labelfile, "INCREMENTAL BACKUP REFERENCE WAL LOCATION: %X/%X\n",
+							 (uint32) (ref_lsn >> 32), (uint32) ref_lsn);
 
 		/*
 		 * Okay, write the file, or return its contents to caller.
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index b35043b..ef8b283 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -89,7 +89,8 @@ pg_start_backup(PG_FUNCTION_ARGS)
 	if (exclusive)
 	{
 		startpoint = do_pg_start_backup(backupidstr, fast, NULL, NULL,
-										NULL, NULL, false, true);
+										NULL, NULL, false, true,
+										InvalidXLogRecPtr);
 	}
 	else
 	{
@@ -105,7 +106,8 @@ pg_start_backup(PG_FUNCTION_ARGS)
 		MemoryContextSwitchTo(oldcontext);
 
 		startpoint = do_pg_start_backup(backupidstr, fast, NULL, label_file,
-										NULL, tblspc_map_file, false, true);
+										NULL, tblspc_map_file, false, true,
+										InvalidXLogRecPtr);
 
 		before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0);
 	}
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 18e992c..a2c0756 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -82,6 +82,12 @@ static pgoff_t do_full_backup(const char *readfilename,
 							  const char *tarfilename, FILE *fp,
 							  struct stat *statbuf, int segmentno,
 							  bool verify_checksum, int *checksum_failures);
+static pgoff_t do_incremental_backup(const char *readfilename,
+									 const char *tarfilename, FILE *fp,
+									 XLogRecPtr refptr,
+									 struct stat *statbuf, int segmentno,
+									 bool verify_checksum,
+									 int *checksum_failures);
 
 /* Was the backup currently in-progress initiated in recovery mode? */
 static bool backup_started_in_recovery = false;
@@ -99,6 +105,11 @@ static char *statrelpath = NULL;
  */
 #define THROTTLING_FREQUENCY	8
 
+/*
+ * Fraction of modified blocks at or above which the whole file is sent (90%)
+ */
+#define WHOLE_FILE_THRESHOLD	0.9
+
 /* The actual number of bytes, transfer of which may cause sleep. */
 static uint64 throttling_sample;
 
@@ -114,6 +125,9 @@ static TimestampTz throttled_last;
 /* The starting XLOG position of the base backup. */
 static XLogRecPtr startptr;
 
+/* The reference XLOG position for the incremental backup. */
+static XLogRecPtr refptr;
+
 /* Total number of checksum failures during base backup. */
 static long long int total_checksum_failures;
 
@@ -254,7 +268,9 @@ perform_base_backup(basebackup_options *opt)
 	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
 								  labelfile, &tablespaces,
 								  tblspc_map_file,
-								  opt->progress, opt->sendtblspcmapfile);
+								  opt->progress, opt->sendtblspcmapfile,
+								  opt->lsn);
+	refptr = opt->lsn;
 
 	/*
 	 * Once do_pg_start_backup has been called, ensure that any failure causes
@@ -1392,6 +1408,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
 	int			segmentno = 0;
 	char	   *segmentpath;
 	bool		verify_checksum = false;
+	char	   *filename;
 
 	fp = AllocateFile(readfilename, "rb");
 	if (fp == NULL)
@@ -1403,17 +1420,15 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
 				 errmsg("could not open file \"%s\": %m", readfilename)));
 	}
 
+	/*
+	 * Get the filename (excluding path).  As last_dir_separator() includes
+	 * the last directory separator, we chop that off by incrementing the
+	 * pointer.
+	 */
+	filename = last_dir_separator(readfilename) + 1;
+
 	if (!noverify_checksums && DataChecksumsEnabled())
 	{
-		char	   *filename;
-
-		/*
-		 * Get the filename (excluding path).  As last_dir_separator()
-		 * includes the last directory separator, we chop that off by
-		 * incrementing the pointer.
-		 */
-		filename = last_dir_separator(readfilename) + 1;
-
 		if (is_checksummed_file(readfilename, filename))
 		{
 			verify_checksum = true;
@@ -1434,9 +1449,23 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
 		}
 	}
 
-	/* Perform full backup */
-	len = do_full_backup(readfilename, tarfilename, fp, statbuf, segmentno,
-						 verify_checksum, &checksum_failures);
+	/*
+	 * If incremental backup, see whether the filename is a relation filename
+	 * or not.
+	 */
+	if (refptr && OidIsValid(dboid) && looks_like_rel_name(filename))
+	{
+		/* Perform incremental backup */
+		len = do_incremental_backup(readfilename, tarfilename, fp,
+									refptr, statbuf, segmentno,
+									verify_checksum, &checksum_failures);
+	}
+	else
+	{
+		/* Perform full backup */
+		len = do_full_backup(readfilename, tarfilename, fp, statbuf, segmentno,
+							 verify_checksum, &checksum_failures);
+	}
 
 	/* If the file was truncated while we were sending it, pad it with zeros */
 	if (len < statbuf->st_size)
@@ -1775,3 +1804,225 @@ do_full_backup(const char *readfilename, const char *tarfilename, FILE *fp,
 
 	return len;
 }
+
+/*
+ * do_incremental_backup
+ *
+ * Send one relation file as part of an incremental backup.
+ *
+ * The file is read into memory in one go and the LSN of each BLCKSZ-sized
+ * page is compared with refptr; pages with LSN >= refptr count as modified.
+ * If the number of bytes read is not a multiple of BLCKSZ, or the fraction of
+ * modified pages reaches WHOLE_FILE_THRESHOLD, the file is sent unchanged
+ * under tarfilename.  Otherwise a "<tarfilename>.partial" member is sent: a
+ * partial_file_header followed by the modified pages in the order listed in
+ * that header.  In that case statbuf->st_size is overwritten with the size
+ * of the partial file.
+ *
+ * If verify_checksum is set, verify_page_checksum() is called for every
+ * modified page, with checksum_failures passed through to it.
+ *
+ * Returns the number of bytes sent after the tar header.
+ */
+static pgoff_t
+do_incremental_backup(const char *readfilename, const char *tarfilename,
+					  FILE *fp, XLogRecPtr refptr, struct stat *statbuf,
+					  int segmentno, bool verify_checksum,
+					  int *checksum_failures)
+{
+	char	   *buf;
+	off_t		cnt;
+	pgoff_t		len = 0;
+	BlockNumber blkno = 0;
+	int			i;
+	bool		sendwholefile = false;
+
+	Assert(statbuf->st_size <= (RELSEG_SIZE * BLCKSZ));
+
+	/*
+	 * Allocate with palloc rather than malloc: the buffer is then released by
+	 * memory-context cleanup if an ereport(ERROR) below is reached, and an
+	 * empty file cannot fail on malloc(0) returning NULL.  A segment may be
+	 * as large as RELSEG_SIZE * BLCKSZ, so ask for a huge allocation.
+	 */
+	buf = (char *) palloc_extended(statbuf->st_size, MCXT_ALLOC_HUGE);
+
+	cnt = fread(buf, 1, statbuf->st_size, fp);
+
+	/*
+	 * Do not mistake a read error for a short or empty file: the caller pads
+	 * whatever is missing with zeros.
+	 */
+	if (ferror(fp))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read file \"%s\": %m", readfilename)));
+
+	if (cnt > 0)
+	{
+		Bitmapset  *mod_blocks = NULL;
+		int			nmodblocks = 0;
+
+		if (cnt % BLCKSZ != 0)
+		{
+			if (verify_checksum)
+			{
+				ereport(WARNING,
+						(errmsg("cannot verify checksum in file \"%s\", block "
+								"%d: read buffer size %d and page size %d "
+								"differ",
+								readfilename, blkno, (int) cnt, BLCKSZ)));
+				verify_checksum = false;
+			}
+
+			ereport(WARNING,
+					(errmsg("file size (%d) not in multiple of page size (%d), sending whole file",
+							(int) cnt, BLCKSZ)));
+
+			/* File size is not in multiple of BLCKSZ, send as is. */
+			sendwholefile = true;
+		}
+
+		/*
+		 * Check each page LSN and see if it is modified after the given LSN or
+		 * not.  Create a bitmap of all such modified blocks and then decide
+		 * whether we want to send a whole file or a partial file.  Skip this
+		 * check if we decided to send whole file already.
+		 */
+		if (!sendwholefile)
+		{
+			XLogRecPtr	pglsn;
+			int			nblocks = (cnt / BLCKSZ);
+
+			for (i = 0; i < nblocks; i++)
+			{
+				char	   *page = buf + (Size) BLCKSZ * i;
+
+				pglsn = PageGetLSN(page);
+
+				if (pglsn >= refptr)
+				{
+					/*
+					 * Verify checksum, if requested, for the modified blocks.
+					 */
+					if (verify_checksum)
+						verify_page_checksum(readfilename, fp, buf, cnt, i,
+											 blkno, segmentno,
+											 checksum_failures);
+
+					mod_blocks = bms_add_member(mod_blocks, i);
+				}
+
+				blkno++;
+			}
+
+			nmodblocks = bms_num_members(mod_blocks);
+
+			/*
+			 * We need to send whole file if the modified block count is equal
+			 * to or greater than the WHOLE_FILE_THRESHOLD.  Check that.
+			 */
+			if (i > 0 && (nmodblocks / (double) i) >= WHOLE_FILE_THRESHOLD)
+				sendwholefile = true;
+		}
+
+		/*
+		 * If sendwholefile is true then we need to send the whole file as is.
+		 * Otherwise send a partial file.
+		 */
+		if (sendwholefile)
+		{
+			_tarWriteHeader(tarfilename, NULL, statbuf, false);
+
+			/* Send the chunk as a CopyData message */
+			if (pq_putmessage('d', buf, cnt))
+				ereport(ERROR,
+						(errmsg("base backup could not send data, aborting backup")));
+
+			len = cnt;
+			throttle(cnt);
+		}
+		else
+		{
+			pgoff_t		part_size;
+			Size		part_header_size;
+			int			blknum;
+			int			blknocnt;
+			partial_file_header *pfh;
+			char	   *partialtarfilename;
+
+			/* Create a partial file */
+
+			/* Calculate partial file size, avoiding int overflow. */
+			part_header_size = offsetof(partial_file_header, blocknumbers) +
+				(sizeof(uint32) * nmodblocks);
+			part_size = (pgoff_t) part_header_size +
+				(pgoff_t) BLCKSZ * nmodblocks;
+
+			/* Add .partial to filename */
+			partialtarfilename = psprintf("%s.partial", tarfilename);
+
+			statbuf->st_size = part_size;
+			_tarWriteHeader(partialtarfilename, NULL, statbuf, false);
+			pfree(partialtarfilename);
+
+			pfh = (partial_file_header *) palloc(part_header_size);
+			pfh->magic = INCREMENTAL_BACKUP_MAGIC;
+			pfh->nblocks = nmodblocks;
+
+			/* List the modified block numbers in the header. */
+			blknum = -1;
+			blknocnt = 0;
+			while ((blknum = bms_next_member(mod_blocks, blknum)) >= 0)
+				pfh->blocknumbers[blknocnt++] = blknum;
+
+			Assert(blknocnt == nmodblocks);
+
+			/*
+			 * Now calculate CRC for the header.
+			 *
+			 * NOTE(review): the CRC covers the entire header, including the
+			 * checksum field itself as left by INIT_CRC32C, and FIN_CRC32C
+			 * is never applied.  A reader of .partial files has to repeat
+			 * exactly this computation; confirm that this is intended.
+			 */
+			INIT_CRC32C(pfh->checksum);
+			COMP_CRC32C(pfh->checksum, pfh, part_header_size);
+
+			/* Send header */
+			if (pq_putmessage('d', (char *) pfh, part_header_size))
+				ereport(ERROR,
+						(errmsg("base backup could not send data, aborting backup")));
+			throttle(part_header_size);
+
+			/* Send data blocks */
+			for (blknocnt = 0; blknocnt < nmodblocks; blknocnt++)
+			{
+				Size		offset = (Size) BLCKSZ * pfh->blocknumbers[blknocnt];
+
+				if (pq_putmessage('d', buf + offset, BLCKSZ))
+					ereport(ERROR,
+							(errmsg("base backup could not send data, aborting backup")));
+				throttle(BLCKSZ);
+			}
+
+			Assert(blknocnt == nmodblocks);
+
+			len = part_size;
+			pfree(pfh);
+		}
+
+		bms_free(mod_blocks);
+	}
+	else
+	{
+		/* Send empty file as is */
+		_tarWriteHeader(tarfilename, NULL, statbuf, false);
+		len = cnt;
+	}
+
+	/* free buffer allocated */
+	pfree(buf);
+
+	return len;
+}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index a76112d..2990c52 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -3111,6 +3111,35 @@ looks_like_temp_rel_name(const char *name)
 	return true;
 }
 
+/*
+ * looks_like_rel_name
+ *		Does "name" have the form <digits> or <digits>.<digits>, with the
+ *		first digit run no longer than OIDCHARS characters?
+ */
+bool
+looks_like_rel_name(const char *name)
+{
+	const char *p = name;
+
+	/* Leading digit run: must be non-empty and at most OIDCHARS long. */
+	while (isdigit((unsigned char) *p))
+		p++;
+	if (p == name || p - name > OIDCHARS)
+		return false;
+
+	/* Optional ".<digits>" suffix; the dot must be followed by a digit. */
+	if (*p == '.')
+	{
+		if (!isdigit((unsigned char) p[1]))
+			return false;
+		for (p += 2; isdigit((unsigned char) *p); p++)
+			;
+	}
+
+	/* Anything left over means the name does not match. */
+	return *p == '\0';
+}
+
 
 /*
  * Issue fsync recursively on PGDATA and all its contents.
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index d519252..155385d 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -347,7 +347,8 @@ typedef enum SessionBackupState
 extern XLogRecPtr do_pg_start_backup(const char *backupidstr, bool fast,
 									 TimeLineID *starttli_p, StringInfo labelfile,
 									 List **tablespaces, StringInfo tblspcmapfile, bool infotbssize,
-									 bool needtblspcmapfile);
+									 bool needtblspcmapfile,
+									 XLogRecPtr ref_lsn);
 extern XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive,
 									TimeLineID *stoptli_p);
 extern void do_pg_abort_backup(void);
diff --git a/src/include/replication/basebackup.h b/src/include/replication/basebackup.h
index 503a5b9..1b35b08 100644
--- a/src/include/replication/basebackup.h
+++ b/src/include/replication/basebackup.h
@@ -20,6 +20,9 @@
 #define MAX_RATE_LOWER	32
 #define MAX_RATE_UPPER	1048576
 
+/* magic number at the start of an incremental backup's .partial file (ASCII "INCR") */
+#define INCREMENTAL_BACKUP_MAGIC	0x494E4352
+
 
 typedef struct
 {
@@ -29,6 +32,16 @@ typedef struct
 	int64		size;
 } tablespaceinfo;
 
+/* Header written at the start of an incremental backup's .partial file */
+typedef struct
+{
+	uint32		magic;			/* set to INCREMENTAL_BACKUP_MAGIC */
+	pg_crc32c	checksum;		/* CRC-32C computed over this header */
+	uint32		nblocks;		/* number of entries in blocknumbers[] */
+	uint32		blocknumbers[FLEXIBLE_ARRAY_MEMBER];	/* block index within the file */
+} partial_file_header;
+
+
 extern void SendBaseBackup(BaseBackupCmd *cmd);
 
 extern int64 sendTablespace(char *path, bool sizeonly);
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index d2a8c52..d25a390 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -136,6 +136,7 @@ extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
 							  SubTransactionId parentSubid);
 extern void RemovePgTempFiles(void);
 extern bool looks_like_temp_rel_name(const char *name);
+extern bool looks_like_rel_name(const char *name);
 
 extern int	pg_fsync(int fd);
 extern int	pg_fsync_no_writethrough(int fd);
-- 
1.8.3.1

