v8-0003-pg_validatebackup-Validate-a-backup-against-the-b.patch

application/octet-stream

Filename: v8-0003-pg_validatebackup-Validate-a-backup-against-the-b.patch
Type: application/octet-stream
Part: 1
Message: Re: backup manifests

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v8-0003
Subject: pg_validatebackup: Validate a backup against the backup manifest.
File+
src/backend/replication/basebackup.c 3 3
src/bin/Makefile 1 0
src/bin/pg_validatebackup/.gitignore 1 0
src/bin/pg_validatebackup/Makefile 32 0
src/bin/pg_validatebackup/pg_validatebackup.c 1089 0
From 52b6e04e1e3a2535770c177ab1c0ee0baa2c35a5 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Fri, 7 Feb 2020 17:17:52 -0500
Subject: [PATCH v8 3/5] pg_validatebackup: Validate a backup against the
 backup manifest.

Patch by me; some off-list review and testing from Mark Dilger,
Davinder Singh, Tushar Ahuja, Rajkumar Raghuwanshi, and Jeevan
Chalke.

(I chose here to make this a separate utility; Suraj wrote a previous
patch for this that made it part of pg_basebackup. Doing it this way
lets us have various command line options that are specific to backup
validation. I've added a few such options and we might want to add
more later.  I also arranged things so that checksum failures are
reported last, as that is the most expensive part of validation. I
believe that my version also does better error checking and reporting.)
---
 src/backend/replication/basebackup.c          |    6 +-
 src/bin/Makefile                              |    1 +
 src/bin/pg_validatebackup/.gitignore          |    1 +
 src/bin/pg_validatebackup/Makefile            |   32 +
 src/bin/pg_validatebackup/pg_validatebackup.c | 1089 +++++++++++++++++
 5 files changed, 1126 insertions(+), 3 deletions(-)
 create mode 100644 src/bin/pg_validatebackup/.gitignore
 create mode 100644 src/bin/pg_validatebackup/Makefile
 create mode 100644 src/bin/pg_validatebackup/pg_validatebackup.c

diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 1729931597..99e102b2a7 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -63,7 +63,7 @@ struct manifest_info
 	pg_checksum_type checksum_type;
 	pg_sha256_ctx manifest_ctx;
 	uint64		manifest_size;
-	int			still_checksumming;
+	bool		still_checksumming;
 };
 
 
@@ -84,7 +84,7 @@ static void SendBackupHeader(List *tablespaces);
 static void InitializeManifest(manifest_info *manifest, pg_checksum_type);
 static void AppendStringToManifest(manifest_info *manifest, char *s);
 static void AddFileToManifest(manifest_info *manifest, const char *spcoid,
-							  const char *filename, size_t size, time_t mtime,
+							  const char *pathname, size_t size, time_t mtime,
 							  pg_checksum_context *checksum_ctx);
 static void SendBackupManifest(manifest_info *manifest);
 static char *escape_field_for_manifest(const char *s);
@@ -976,7 +976,7 @@ AppendStringToManifest(manifest_info *manifest, char *s)
  */
 static void
 AddFileToManifest(manifest_info *manifest, const char *spcoid,
-				  const char *filename, size_t size, time_t mtime,
+				  const char *pathname, size_t size, time_t mtime,
 				  pg_checksum_context *checksum_ctx)
 {
 	char		pathbuf[MAXPGPATH];
diff --git a/src/bin/Makefile b/src/bin/Makefile
index 7f4120a34f..77bceea4fe 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -27,6 +27,7 @@ SUBDIRS = \
 	pg_test_fsync \
 	pg_test_timing \
 	pg_upgrade \
+	pg_validatebackup \
 	pg_waldump \
 	pgbench \
 	psql \
diff --git a/src/bin/pg_validatebackup/.gitignore b/src/bin/pg_validatebackup/.gitignore
new file mode 100644
index 0000000000..3ae1c1f03a
--- /dev/null
+++ b/src/bin/pg_validatebackup/.gitignore
@@ -0,0 +1 @@
+/pg_validatebackup
diff --git a/src/bin/pg_validatebackup/Makefile b/src/bin/pg_validatebackup/Makefile
new file mode 100644
index 0000000000..aeb97d21d2
--- /dev/null
+++ b/src/bin/pg_validatebackup/Makefile
@@ -0,0 +1,32 @@
+# src/bin/pg_validatebackup/Makefile
+
+PGFILEDESC = "pg_validatebackup - validate a backup against a backup manifest"
+PGAPPICON = win32
+
+subdir = src/bin/pg_validatebackup
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+# We need libpq only because fe_utils does.
+LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport)
+
+OBJS = \
+	$(WIN32RES) \
+	pg_validatebackup.o
+
+all: pg_validatebackup
+
+pg_validatebackup: $(OBJS) | submake-libpq submake-libpgport submake-libpgfeutils
+	$(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X)
+
+install: all installdirs
+	$(INSTALL_PROGRAM) pg_validatebackup$(X) '$(DESTDIR)$(bindir)/pg_validatebackup$(X)'
+
+installdirs:
+	$(MKDIR_P) '$(DESTDIR)$(bindir)'
+
+uninstall:
+	rm -f '$(DESTDIR)$(bindir)/pg_validatebackup$(X)'
+
+clean distclean maintainer-clean:
+	rm -f pg_validatebackup$(X) $(OBJS)
diff --git a/src/bin/pg_validatebackup/pg_validatebackup.c b/src/bin/pg_validatebackup/pg_validatebackup.c
new file mode 100644
index 0000000000..4f47b20855
--- /dev/null
+++ b/src/bin/pg_validatebackup/pg_validatebackup.c
@@ -0,0 +1,1089 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_validatebackup.c
+ *	  Validate a backup against a backup manifest.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/bin/pg_validatebackup/pg_validatebackup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "common/checksum_helper.h"
+#include "common/hashfn.h"
+#include "common/logging.h"
+#include "fe_utils/simple_list.h"
+#include "getopt_long.h"
+
+/*
+ * For efficiency, we'd like our hash table containing information about the
+ * manifest to start out with approximately the correct number of entries.
+ * There's no way to know the exact number of entries without reading the whole
+ * file, but we can get an estimate by dividing the file size by the estimated
+ * number of bytes per line.
+ *
+ * This could be off by about a factor of two in either direction, because the
+ * checksum algorithm has a big impact on the line lengths; e.g. a SHA512
+ * checksum is 128 hex bytes, whereas a CRC-32C value is only 8, and there
+ * might be no checksum at all.
+ */
+#define ESTIMATED_BYTES_PER_MANIFEST_LINE	100
+
+/*
+ * How many bytes should we try to read from a file at once?
+ */
+#define READ_CHUNK_SIZE				4096
+
+/*
+ * The first word of each line of the manifest file should be one of these
+ * key words. We define constants for the relevant lengths as well.
+ */
+#define KW_MANIFEST_VERSION			"PostgreSQL-Backup-Manifest-Version"
+#define KW_MANIFEST_FILE			"File"
+#define KW_MANIFEST_CHECKSUM		"Manifest-Checksum"
+#define KWL_MANIFEST_VERSION		(sizeof(KW_MANIFEST_VERSION)-1)
+#define KWL_MANIFEST_FILE			(sizeof(KW_MANIFEST_FILE)-1)
+#define KWL_MANIFEST_CHECKSUM		(sizeof(KW_MANIFEST_CHECKSUM)-1)
+
+/*
+ * How many fields are there for each "File" line in the manifest?
+ * Currently we have: file name, file size, timestamp, checksum.
+ */
+#define FIELDS_PER_FILE_LINE		4
+
+/*
+ * Each "File" line in the manifest file is parsed to produce an object
+ * like this.
+ */
+typedef struct manifestfile
+{
+	uint32		status;			/* hash status */
+	char	   *pathname;		/* hash key: path named on the "File" line */
+	size_t		size;			/* file size recorded in the manifest */
+	pg_checksum_type checksum_type; /* checksum algorithm from the manifest */
+	int			checksum_length;	/* length of checksum_payload, in bytes */
+	uint8	   *checksum_payload;	/* hex-decoded checksum, or NULL if none */
+	bool		matched;		/* set once the file is found on disk */
+	bool		bad;			/* set on size mismatch; checksum is skipped */
+} manifestfile;
+
+/*
+ * Define a hash table which we can use to store information about the files
+ * mentioned in the backup manifest.
+ */
+static uint32 hash_string_pointer(char *s);
+#define SH_PREFIX		manifestfiles
+#define SH_ELEMENT_TYPE	manifestfile
+#define SH_KEY_TYPE		char *
+#define	SH_KEY			pathname
+#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+/*
+ * All of the context information we need while checking a backup manifest.
+ */
+typedef struct validator_context
+{
+	manifestfiles_hash *ht;		/* files listed in the manifest */
+	char	   *backup_directory;	/* directory being validated */
+	SimpleStringList ignore_list;	/* relative paths (and subtrees) to skip */
+	bool		exit_on_error;	/* exit(1) as soon as an error is reported */
+	bool		saw_any_error;	/* has any error been reported so far? */
+} validator_context;
+
+static manifestfiles_hash * parse_manifest_file(char *manifest_path);
+static void parse_file_line_from_manifest(manifestfile *f, char *rest,
+										  int restlen);
+static void validate_backup_directory(validator_context *context,
+									  char *relpath, char *fullpath);
+static void validate_backup_file(validator_context *context,
+								 char *relpath, char *fullpath);
+static void report_extra_backup_files(validator_context *context);
+static void validate_backup_checksums(validator_context *context);
+static void validate_file_checksum(validator_context *context,
+								   manifestfile *tabent, char *pathname);
+
+static void pg_validator_error(validator_context *context,
+							   const char *pg_restrict fmt,...)
+			pg_attribute_printf(2, 3);
+static bool should_ignore_relpath(validator_context *context, char *relpath);
+
+static char *extractstr(char *buffer, int length);
+static int	findchar(char *buffer, int size, char c, int start_position);
+static int	findfield(char *buffer, char *end, char **result);
+static int	hexdecode_char(char c);
+static bool hexdecode_string(uint8 *result, char *input, int nbytes);
+static void usage(void);
+
+static const char *progname;
+
+/*
+ * Main entry point: parse options, load the manifest, validate the backup.
+ */
+int
+main(int argc, char **argv)
+{
+	static struct option long_options[] = {
+		{"exit-on-error", no_argument, NULL, 'e'},
+		{"ignore", required_argument, NULL, 'i'},
+		{"manifest-path", required_argument, NULL, 'm'},
+		{"quiet", no_argument, NULL, 'q'},
+		{"skip-checksums", no_argument, NULL, 's'},
+		{NULL, 0, NULL, 0}
+	};
+
+	int			c;
+	validator_context context;
+	char	   *manifest_path = NULL;
+	bool		quiet = false;
+	bool		skip_checksums = false;
+
+	pg_logging_init(argv[0]);
+	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_validatebackup"));
+	progname = get_progname(argv[0]);
+
+	memset(&context, 0, sizeof(context));
+
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			usage();
+			exit(0);
+		}
+		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+		{
+			puts("pg_validatebackup (PostgreSQL) " PG_VERSION);
+			exit(0);
+		}
+	}
+
+	/* Always ignore backup_manifest file and pg_wal directory. */
+	simple_string_list_append(&context.ignore_list, "backup_manifest");
+	simple_string_list_append(&context.ignore_list, "pg_wal");
+
+	while ((c = getopt_long(argc, argv, "ei:m:qs", long_options, NULL)) != -1)
+	{
+		switch (c)
+		{
+			case 'e':
+				context.exit_on_error = true;
+				break;
+			case 'i':
+				simple_string_list_append(&context.ignore_list, optarg);
+				break;
+			case 'm':
+				manifest_path = optarg;
+				break;
+			case 'q':
+				quiet = true;
+				break;
+			case 's':
+				skip_checksums = true;
+				break;
+			default:
+				fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+						progname);
+				exit(1);
+		}
+	}
+
+	/* The first non-option argument is the backup directory. */
+	if (optind >= argc)
+	{
+		pg_log_fatal("no backup directory specified");
+		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+				progname);
+		exit(1);
+	}
+	context.backup_directory = argv[optind++];
+
+	/* Complain if any arguments remain */
+	if (optind < argc)
+	{
+		pg_log_fatal("too many command-line arguments (first is \"%s\")",
+					 argv[optind]);
+		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+				progname);
+		exit(1);
+	}
+
+	/* By default, look for the manifest in the backup directory. */
+	if (manifest_path == NULL)
+		manifest_path = psprintf("%s/backup_manifest",
+								 context.backup_directory);
+
+	/*
+	 * Try to read the manifest. We treat any errors encountered while parsing
+	 * the manifest as fatal; there doesn't seem to be much point in trying to
+	 * validate the backup directory against a corrupted manifest.
+	 */
+	context.ht = parse_manifest_file(manifest_path);
+
+	/*
+	 * Now scan the files in the backup directory. At this stage, we verify
+	 * that every file on disk is present in the manifest and that the sizes
+	 * match. We also set the "matched" flag on every manifest entry that
+	 * corresponds to a file on disk.
+	 */
+	validate_backup_directory(&context, NULL, context.backup_directory);
+
+	/*
+	 * The "matched" flag should now be set on every entry in the hash table.
+	 * Any entries for which the bit is not set are files mentioned in the
+	 * manifest that don't exist on disk.
+	 */
+	report_extra_backup_files(&context);
+
+	/*
+	 * Finally, do the expensive work of verifying file checksums, unless we
+	 * were told to skip it.
+	 */
+	if (!skip_checksums)
+		validate_backup_checksums(&context);
+
+	/*
+	 * If everything looks OK, tell the user this, unless we were asked to
+	 * work quietly.
+	 */
+	if (!context.saw_any_error && !quiet)
+		pg_log_info("backup successfully verified");
+
+	exit(context.saw_any_error ? 1 : 0);
+}
+
+/*
+ * Parse the manifest at 'manifest_path' into a hash table of the files it
+ * lists. Any open, read, parse or checksum failure is fatal: log and exit(1).
+ */
+static manifestfiles_hash *
+parse_manifest_file(char *manifest_path)
+{
+	int			fd;
+	struct stat statbuf;
+	off_t		estimate;
+	off_t		bytes_read = 0;
+	off_t		bytes_consumed = 0;
+	uint32		initial_size;
+	manifestfiles_hash *ht;
+	char	   *buffer;
+	uint8		manifest_checksum_actual[PG_SHA256_DIGEST_LENGTH];
+	uint8		manifest_checksum_expected[PG_SHA256_DIGEST_LENGTH];
+	int			buffer_position = 0;
+	int			buffer_size = 0;
+	int			buffer_maxsize = 2 * READ_CHUNK_SIZE;
+	int			line_number = 0;
+	bool		saw_manifest_checksum_line = false;
+	pg_sha256_ctx manifest_ctx;
+
+	/* Prepare to compute a checksum of the manifest itself. */
+	pg_sha256_init(&manifest_ctx);
+
+	/* Open the manifest file. */
+	if ((fd = open(manifest_path, O_RDONLY | PG_BINARY, 0)) < 0)
+	{
+		pg_log_fatal("could not open file \"%s\": %m", manifest_path);
+		exit(1);
+	}
+
+	/* Figure out how big the manifest is. */
+	if (fstat(fd, &statbuf) != 0)
+	{
+		pg_log_fatal("could not stat file \"%s\": %m", manifest_path);
+		exit(1);
+	}
+
+	/* Guess how large to make the hash table based on the manifest size. */
+	estimate = statbuf.st_size / ESTIMATED_BYTES_PER_MANIFEST_LINE;
+	initial_size = Min(PG_UINT32_MAX, Max(estimate, 256));
+
+	/* Create the hash table. */
+	ht = manifestfiles_create(initial_size, NULL);
+
+	/* Initialize our read buffer. */
+	buffer = pg_malloc(buffer_maxsize);
+
+	/*
+	 * Loop until we've read it all.
+	 *
+	 * The file size shouldn't be changing, so it seems fine to just error out
+	 * if the final length is different from what stat() told us.
+	 */
+	while (bytes_consumed < statbuf.st_size)
+	{
+		int			line_length;
+		int			first_field_length;
+		char	   *rest;
+		int			restlen;
+
+		/* Find next newline if any. */
+		line_length = findchar(buffer, buffer_size, '\n', buffer_position);
+
+		/* If no newline was found, we need to read more data and try again. */
+		if (line_length == -1)
+		{
+			size_t		bytes_to_read;
+			int			rc;
+
+			bytes_to_read = Min(statbuf.st_size - bytes_read, READ_CHUNK_SIZE);
+			if (bytes_to_read == 0)
+			{
+				pg_log_fatal("manifest file line not terminated by newline");
+				exit(1);
+			}
+			/* Grow the buffer if this chunk won't fit after existing data. */
+			if (buffer_size + bytes_to_read > buffer_maxsize)
+			{
+				buffer_maxsize += READ_CHUNK_SIZE;
+				buffer = pg_realloc(buffer, buffer_maxsize);
+				Assert(buffer_size + bytes_to_read <= buffer_maxsize);
+			}
+			rc = read(fd, buffer + buffer_size, bytes_to_read);
+			if (rc != bytes_to_read)
+			{
+				if (rc < 0)
+					pg_log_fatal("could not read file \"%s\": %m",
+								 manifest_path);
+				else
+					pg_log_fatal("could not read file \"%s\": read %d of %zu",
+								 manifest_path, rc, bytes_to_read);
+				exit(1);
+			}
+			buffer_size += rc;
+			bytes_read += rc;
+			continue;
+		}
+
+		/* Increment line number. */
+		++line_number;
+
+		/* The manifest checksum should be the last thing in the file. */
+		if (saw_manifest_checksum_line)
+		{
+			pg_log_fatal("unexpected data follows manifest checksum");
+			exit(1);
+		}
+
+		/* Find first field (within this line only) and remaining contents. */
+		first_field_length = findchar(buffer, buffer_position + line_length,
+									  '\t', buffer_position);
+		rest = buffer + buffer_position + first_field_length + 1;
+		restlen = line_length - (first_field_length + 1);
+
+		/*
+		 * Check the first word of the line to see what kind of line it is.
+		 */
+		if (first_field_length == KWL_MANIFEST_VERSION &&
+			memcmp(buffer + buffer_position, KW_MANIFEST_VERSION,
+				   KWL_MANIFEST_VERSION) == 0)
+		{
+			if (line_number != 1)
+			{
+				pg_log_fatal("manifest file version should only be specified at line 1");
+				exit(1);
+			}
+			else
+			{
+				char	   *line = buffer + buffer_position;
+				char	   *version;
+
+				version = extractstr(line + first_field_length + 1,
+									 line_length - (first_field_length + 1));
+				if (strcmp(version, "1") != 0)
+				{
+					pg_log_fatal("unrecognized manifest version: \"%s\"",
+								 version);
+					exit(1);
+				}
+			}
+		}
+		else if (first_field_length == KWL_MANIFEST_FILE &&
+				 memcmp(buffer + buffer_position, KW_MANIFEST_FILE,
+						KWL_MANIFEST_FILE) == 0)
+		{
+			manifestfile f;
+			manifestfile *tabent;
+			bool		found;
+
+			/* Parse this line. */
+			parse_file_line_from_manifest(&f, rest, restlen);
+
+			/* Make a new entry in the hash table for it. */
+			tabent = manifestfiles_insert(ht, f.pathname, &found);
+			if (found)
+			{
+				pg_log_fatal("duplicate pathname in backup manifest: \"%s\"",
+							 f.pathname);
+				exit(1);
+			}
+
+			/* Copy in all the relevant details. */
+			tabent->size = f.size;
+			tabent->checksum_type = f.checksum_type;
+			tabent->checksum_length = f.checksum_length;
+			tabent->checksum_payload = f.checksum_payload;
+			tabent->matched = false;
+			tabent->bad = false;
+		}
+		else if (first_field_length == KWL_MANIFEST_CHECKSUM &&
+				 memcmp(buffer + buffer_position, KW_MANIFEST_CHECKSUM,
+						KWL_MANIFEST_CHECKSUM) == 0)
+		{
+			saw_manifest_checksum_line = true;
+			if (restlen != PG_SHA256_DIGEST_STRING_LENGTH - 1)
+			{
+				pg_log_fatal("manifest file checksum has unexpected length: %d",
+							 restlen);
+				exit(1);
+			}
+			if (!hexdecode_string(manifest_checksum_expected, rest,
+								  PG_SHA256_DIGEST_LENGTH))
+			{
+				pg_log_fatal("invalid manifest checksum: \"%s\"",
+							 extractstr(rest, restlen));
+				exit(1);
+			}
+		}
+		else if (first_field_length == -1)
+		{
+			pg_log_fatal("manifest file keyword not terminated by tab");
+			exit(1);
+		}
+		else
+		{
+			char	   *kw;
+
+			kw = extractstr(buffer + buffer_position, first_field_length);
+			pg_log_fatal("unrecognized manifest file keyword: \"%s\"", kw);
+			exit(1);
+		}
+
+		/* Update manifest checksum, if needed. */
+		if (!saw_manifest_checksum_line)
+			pg_sha256_update(&manifest_ctx, (uint8 *) buffer + buffer_position,
+							 line_length + 1);
+
+		/* Advance past this line, and count it as consumed for loop exit. */
+		buffer_position += line_length + 1;
+		bytes_consumed += line_length + 1;
+
+		/*
+		 * We don't want to incur the expense of using memmove() to discard
+		 * data after every line, because the lines are short compared to the
+		 * chunk size -- but we must do it at least now and then, or we'll
+		 * have to keep growing the buffer.
+		 */
+		if (buffer_position >= READ_CHUNK_SIZE)
+		{
+			int			leftover_bytes = buffer_size - buffer_position;
+
+			if (leftover_bytes > 0)
+				memmove(buffer, buffer + buffer_position, leftover_bytes);
+			buffer_size -= buffer_position;
+			buffer_position = 0;
+		}
+	}
+
+	/* Checksum verification; a manifest without a checksum is rejected. */
+	if (!saw_manifest_checksum_line)
+	{
+		pg_log_fatal("manifest has no checksum");
+		exit(1);
+	}
+	pg_sha256_final(&manifest_ctx, manifest_checksum_actual);
+	if (memcmp(manifest_checksum_actual, manifest_checksum_expected,
+			   PG_SHA256_DIGEST_LENGTH) != 0)
+	{
+		pg_log_fatal("manifest checksum does not match");
+		exit(1);
+	}
+
+	/* OK, we're done with the manifest file; hand back the hash table. */
+	close(fd);
+	return ht;
+}
+
+/*
+ * Parse the fields of one "File" line of the manifest into *f.
+ *
+ * The caller passes the remainder of the line, excluding the initial "File\t"
+ * portion; 'restlen' is its length, not counting the trailing newline. Only
+ * the pathname, size, and checksum members of *f are filled in; the timestamp
+ * field is split out but not otherwise examined. Any malformed line is
+ * reported as a fatal error, after which we exit.
+ */
+static void
+parse_file_line_from_manifest(manifestfile *f, char *rest, int restlen)
+{
+	char	   *end = rest + restlen;
+	char	   *field[FIELDS_PER_FILE_LINE];
+	unsigned long filesize;
+	char	   *ep;
+	pg_checksum_type checksum_type;
+	int			raw_checksum_length = 0;
+	char	   *raw_checksum_payload = NULL;
+	int			checksum_length;
+	uint8	   *checksum_payload;
+	int			i;
+	char	   *s;
+
+	/* Split the line into fields. */
+	for (i = 0; i < FIELDS_PER_FILE_LINE; ++i)
+	{
+		int			toklen;
+
+		toklen = findfield(rest, end, &field[i]);
+		if (rest + toklen >= end && i + 1 < FIELDS_PER_FILE_LINE)
+		{
+			pg_log_fatal("manifest file line has too few fields");
+			exit(1);
+		}
+		rest += toklen + 1;
+	}
+
+	/* We expect to have used the entire line. */
+	if (rest < end)
+	{
+		pg_log_fatal("manifest file line has too many fields");
+		exit(1);
+	}
+
+	/* Parse the size; an empty field is not a number either. */
+	filesize = strtoul(field[1], &ep, 10);
+	if (ep == field[1] || *ep)
+	{
+		pg_log_fatal("manifest file size for file \"%s\" is not a number",
+					 field[0]);
+		exit(1);
+	}
+
+	/* Parse the checksum type, splitting "TYPE:payload" at the colon. */
+	s = strchr(field[3], ':');
+	if (s != NULL)
+	{
+		raw_checksum_payload = s + 1;
+		raw_checksum_length = strlen(raw_checksum_payload);
+		*s = '\0';
+	}
+	if (!pg_checksum_parse_type(field[3], &checksum_type))
+	{
+		pg_log_fatal("unrecognized checksum algorithm for file \"%s\": \"%s\"",
+					 field[0], field[3]);
+		exit(1);
+	}
+
+	/* Decode the checksum payload, which must be whole hex-encoded bytes. */
+	checksum_length = raw_checksum_length / 2;
+	checksum_payload = checksum_length > 0 ? palloc(checksum_length) : NULL;
+	if (raw_checksum_length % 2 != 0 ||
+		!hexdecode_string(checksum_payload, raw_checksum_payload,
+						  checksum_length))
+	{
+		pg_log_fatal("invalid checksum for file \"%s\": \"%s\"",
+					 field[0], raw_checksum_payload);
+		exit(1);
+	}
+
+	/* Fill the output struct. */
+	f->pathname = field[0];
+	f->size = filesize;
+	f->checksum_type = checksum_type;
+	f->checksum_length = checksum_length;
+	f->checksum_payload = checksum_payload;
+}
+
+/*
+ * Validate one directory.
+ *
+ * 'relpath' is NULL if we are to validate the top-level backup directory,
+ * and otherwise the relative path to the directory that is to be validated.
+ *
+ * 'fullpath' is the backup directory with 'relpath' appended; i.e. the actual
+ * filesystem path at which it can be found.
+ */
+static void
+validate_backup_directory(validator_context *context, char *relpath,
+						  char *fullpath)
+{
+	DIR		   *dir;
+	struct dirent *dirent;
+
+	dir = opendir(fullpath);
+	if (dir == NULL)
+	{
+		pg_validator_error(context,
+						   "could not open directory \"%s\": %m", fullpath);
+
+		/* Suppress further errors for this path and anything below it. */
+		if (relpath != NULL)
+			simple_string_list_append(&context->ignore_list, relpath);
+		return;
+	}
+
+	while (errno = 0, (dirent = readdir(dir)) != NULL)
+	{
+		char	   *filename = dirent->d_name;
+		char	   *newfullpath;
+		char	   *newrelpath;
+
+		/* Skip "." and ".." before allocating anything for this entry. */
+		if (filename[0] == '.' && (filename[1] == '\0'
+								   || strcmp(filename, "..") == 0))
+			continue;
+
+		newfullpath = psprintf("%s/%s", fullpath, filename);
+		if (relpath == NULL)
+			newrelpath = pstrdup(filename);
+		else
+			newrelpath = psprintf("%s/%s", relpath, filename);
+
+		if (!should_ignore_relpath(context, newrelpath))
+			validate_backup_file(context, newrelpath, newfullpath);
+
+		pfree(newfullpath);
+		pfree(newrelpath);
+	}
+
+	/* readdir() returns NULL both at end of directory and on failure. */
+	if (errno != 0)
+		pg_validator_error(context,
+						   "could not read directory \"%s\": %m", fullpath);
+
+	if (closedir(dir))
+		pg_validator_error(context,
+						   "could not close directory \"%s\": %m", fullpath);
+}
+
+/*
+ * Validate one file (which might actually be a directory or a symlink;
+ * stat() follows symlinks, so a symlink is judged by what it points to).
+ *
+ * The arguments have the same meaning as for validate_backup_directory.
+ */
+static void
+validate_backup_file(validator_context *context, char *relpath, char *fullpath)
+{
+	struct stat sb;
+	manifestfile *tabent;
+
+	if (stat(fullpath, &sb) != 0)
+	{
+		pg_validator_error(context,
+						   "could not stat file or directory \"%s\": %m",
+						   relpath);
+
+		/*
+		 * Suppress further errors related to this path name and, if it's a
+		 * directory, anything underneath it.
+		 */
+		simple_string_list_append(&context->ignore_list, relpath);
+
+		return;
+	}
+
+	/* If it's a directory, just recurse. */
+	if (S_ISDIR(sb.st_mode))
+	{
+		validate_backup_directory(context, relpath, fullpath);
+		return;
+	}
+
+	/* If it's not a directory, it should be a plain file. */
+	if (!S_ISREG(sb.st_mode))
+	{
+		pg_validator_error(context,
+						   "\"%s\" is not a file or directory",
+						   relpath);
+		return;
+	}
+
+	/* Check whether there's an entry in the manifest hash. */
+	tabent = manifestfiles_lookup(context->ht, relpath);
+	if (tabent == NULL)
+	{
+		pg_validator_error(context,
+						   "\"%s\" is present on disk but not in the manifest",
+						   relpath);
+		return;
+	}
+
+	/* Flag this entry as having been encountered in the filesystem. */
+	tabent->matched = true;
+
+	/* Check that the size matches; a mismatch also skips the checksum. */
+	if (tabent->size != sb.st_size)
+	{
+		pg_validator_error(context,
+						   "\"%s\" has size %zu on disk but size %zu in the manifest",
+						   relpath, (size_t) sb.st_size, tabent->size);
+		tabent->bad = true;
+	}
+
+	/*
+	 * We don't validate checksums at this stage. We first finish validating
+	 * that we have the expected set of files with the expected sizes, and
+	 * only afterwards verify the checksums. That's because computing
+	 * checksums may take a while, and we'd like to report more obvious
+	 * problems quickly.
+	 */
+}
+
+/*
+ * Report every manifest entry that was never matched to a file on disk and
+ * that is not covered by the ignore list.
+ */
+static void
+report_extra_backup_files(validator_context *context)
+{
+	manifestfiles_iterator iter;
+	manifestfile *entry;
+
+	manifestfiles_start_iterate(context->ht, &iter);
+	while ((entry = manifestfiles_iterate(context->ht, &iter)) != NULL)
+		if (!(entry->matched ||
+			  should_ignore_relpath(context, entry->pathname)))
+			pg_validator_error(context,
+							   "\"%s\" is present in the manifest but not on disk",
+							   entry->pathname);
+}
+
+/*
+ * Verify checksums for the manifest entries that look fine so far.
+ *
+ * Entries that were not found on disk, were already flagged as bad, carry
+ * no checksum, or fall under the ignore list are skipped.
+ */
+static void
+validate_backup_checksums(validator_context *context)
+{
+	manifestfiles_iterator iter;
+	manifestfile *entry;
+
+	manifestfiles_start_iterate(context->ht, &iter);
+	while ((entry = manifestfiles_iterate(context->ht, &iter)) != NULL)
+	{
+		char	   *path;
+
+		/* Skip anything we have no business checksumming. */
+		if (!entry->matched || entry->bad ||
+			entry->checksum_type == CHECKSUM_TYPE_NONE ||
+			should_ignore_relpath(context, entry->pathname))
+			continue;
+
+		/* Compute the on-disk location of the file. */
+		path = psprintf("%s/%s", context->backup_directory,
+						entry->pathname);
+
+		/* Check it, then release the path so we don't leak memory. */
+		validate_file_checksum(context, entry, path);
+		pfree(path);
+	}
+}
+
+/*
+ * Validate the checksum of a single file.
+ *
+ * 'fullpath' is owned (and freed) by the caller; errors name tabent->pathname.
+ */
+static void
+validate_file_checksum(validator_context *context, manifestfile *tabent,
+					   char *fullpath)
+{
+	pg_checksum_context checksum_ctx;
+	char	   *relpath = tabent->pathname;
+	int			fd;
+	int			rc;
+	uint8		buffer[READ_CHUNK_SIZE];
+	uint8		checksumbuf[PG_CHECKSUM_MAX_LENGTH];
+	int			checksumlen;
+
+	/* Open the target file in binary mode so we checksum the raw bytes. */
+	if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) < 0)
+	{
+		pg_validator_error(context, "could not open file \"%s\": %m",
+						   relpath);
+		return;
+	}
+
+	/* Initialize checksum context. */
+	pg_checksum_init(&checksum_ctx, tabent->checksum_type);
+
+	/* Read the file chunk by chunk, updating the checksum as we go. */
+	while ((rc = read(fd, buffer, READ_CHUNK_SIZE)) > 0)
+		pg_checksum_update(&checksum_ctx, buffer, rc);
+	if (rc < 0)
+		pg_validator_error(context, "could not read file \"%s\": %m",
+						   relpath);
+
+	/* Close the file. */
+	if (close(fd) != 0)
+	{
+		pg_validator_error(context, "could not close file \"%s\": %m",
+						   relpath);
+		return;
+	}
+
+	/* If we didn't manage to read the whole file, bail out now. */
+	if (rc < 0)
+		return;
+
+	/* Get the final checksum. */
+	checksumlen = pg_checksum_final(&checksum_ctx, checksumbuf);
+
+	/* And check it against the manifest (actual length first, then expected). */
+	if (checksumlen != tabent->checksum_length)
+		pg_validator_error(context,
+						   "file \"%s\" has checksum of length %d, but expected %d",
+						   relpath, checksumlen, tabent->checksum_length);
+	else if (memcmp(checksumbuf, tabent->checksum_payload, checksumlen) != 0)
+		pg_validator_error(context,
+						   "checksum mismatch for file \"%s\"",
+						   relpath);
+}
+
+/* Print out usage information; exiting is left to the caller. */
+static void
+usage(void)
+{
+	/* The options listed here must match long_options[] in main(). */
+	printf(_("%s validates a backup against the backup manifest.\n\n"), progname);
+	printf(_("Usage:\n  %s [OPTION]... BACKUPDIR\n\n"), progname);
+	printf(_("Options:\n"));
+	printf(_("  -e, --exit-on-error         exit immediately on error\n"));
+	printf(_("  -i, --ignore=RELATIVE_PATH  ignore indicated path\n"));
+	printf(_("  -m, --manifest-path=PATH    use specified path for manifest\n"));
+	printf(_("  -q, --quiet                 do not print any output, except for errors\n"));
+	printf(_("  -s, --skip-checksums        skip checksum verification\n"));
+	printf(_("  -V, --version               output version information, then exit\n"));
+	printf(_("  -?, --help                  show this help, then exit\n"));
+	printf(_("\nReport bugs to <pgsql-bugs@lists.postgresql.org>.\n"));
+}
+
+/*
+ * Log an error-level message and remember in the context that validation
+ * has failed; if the context asks for it, exit right away.
+ */
+static void
+pg_validator_error(validator_context *context, const char *pg_restrict fmt,...)
+{
+	va_list		args;
+
+	/* Record the failure, then emit the message itself. */
+	context->saw_any_error = true;
+	va_start(args, fmt);
+	pg_log_generic_v(PG_LOG_ERROR, fmt, args);
+	va_end(args);
+	if (context->exit_on_error)
+		exit(1);
+}
+
+/*
+ * Does the ignore list contain the given relative path, or one of its
+ * parent directories?
+ *
+ * An entry only counts as a parent if it ends at a path separator: "aa/bb"
+ * covers "aa/bb" and "aa/bb/cc", but not "aa/bbb".
+ */
+static bool
+should_ignore_relpath(validator_context *context, char *relpath)
+{
+	SimpleStringListCell *cell;
+
+	for (cell = context->ignore_list.head; cell != NULL; cell = cell->next)
+	{
+		const char *prefix = cell->val;
+		size_t		prefixlen = strlen(prefix);
+
+		/* The entry must match the leading bytes of relpath exactly ... */
+		if (strncmp(relpath, prefix, prefixlen) != 0)
+			continue;
+		if (relpath[prefixlen] == '\0' || relpath[prefixlen] == '/')
+			return true;		/* ... and stop at a component boundary. */
+	}
+
+	return false;
+}
+
+/*
+ * Copy 'length' bytes of 'buffer' into a new NUL-terminated string.
+ */
+static char *
+extractstr(char *buffer, int length)
+{
+	char	   *copy;
+
+	copy = palloc(length + 1);
+	memcpy(copy, buffer, length);
+	copy[length] = '\0';
+	return copy;
+}
+
+/*
+ * Search buffer[start_position .. size - 1] for the first occurrence of the
+ * character 'c'. Returns its distance from start_position, or -1 if the
+ * character does not occur in that range (which is also the result when
+ * start_position is not less than size).
+ */
+static int
+findchar(char *buffer, int size, char c, int start_position)
+{
+	int			offset = 0;
+
+	while (start_position + offset < size &&
+		   buffer[start_position + offset] != c)
+		++offset;
+	return (start_position + offset < size) ? offset : -1;
+}
+
+/*
+ * Extract the next field from a line of text read from the manifest file.
+ *
+ * 'buffer' is the start of the field and 'end' is just past the end of the
+ * line. A freshly allocated, de-escaped copy is stored in *result; the return
+ * value is the field's raw length in the line, quotes included, tab excluded.
+ */
+static int
+findfield(char *buffer, char *end, char **result)
+{
+	int			qoffset = 1;
+	int			dqcount = 0;
+	int			toklen;
+	int			bufpos;
+	int			resultpos;
+
+	/*
+	 * If this field is unquoted, we just stop at the next tab; if there's
+	 * none, we stop at the end of the line. Note that if buffer == end, it
+	 * just means that the last field on the line is empty.
+	 */
+	if (buffer == end || *buffer != '"')
+	{
+		toklen = findchar(buffer, end - buffer, '\t', 0);
+
+		if (toklen == -1)
+			toklen = end - buffer;
+		*result = extractstr(buffer, toklen);
+		return toklen;
+	}
+
+	/*
+	 * Our escaping convention is that if the field contains a tab, it must be
+	 * surrounded by double-quotes and any internal double-quotes must be
+	 * doubled.
+	 */
+	while (1)
+	{
+		/* Where's the next double quote? Test for -1 before adding it in. */
+		toklen = findchar(buffer, end - buffer, '"', qoffset);
+		if (toklen == -1)
+		{
+			pg_log_fatal("quoted field in backup manifest is not terminated");
+			exit(1);
+		}
+		qoffset += toklen;
+
+		/*
+		 * If the double-quote we found is the last character on the line or
+		 * if it's followed by a tab, we've reached the end of this field.
+		 */
+		if (buffer + qoffset + 1 >= end || buffer[qoffset + 1] == '\t')
+			break;
+
+		/* Otherwise, the next character should be another double-quote. */
+		if (buffer[qoffset + 1] != '"')
+		{
+			pg_log_fatal("invalid quoted field in backup manifest");
+			exit(1);
+		}
+
+		/* Skip both double-quotes and go around again. */
+		qoffset += 2;
+		++dqcount;
+	}
+
+	/*
+	 * qoffset is now the offset of the closing double-quote and dqcount the
+	 * number of doubled double-quotes inside the field. The result needs the
+	 * raw token length, minus two for the enclosing quotes, minus one per
+	 * doubled double-quote, plus one for the trailing zero byte.
+	 */
+	toklen = qoffset + 1;
+	*result = palloc(toklen - dqcount - 1);
+	bufpos = 1;
+	resultpos = 0;
+	while (bufpos < qoffset)
+	{
+		(*result)[resultpos] = buffer[bufpos];
+		bufpos += (buffer[bufpos] == '"' ? 2 : 1);
+		++resultpos;
+	}
+	(*result)[resultpos] = '\0';
+	Assert(resultpos == toklen - dqcount - 2);
+
+	return toklen;
+}
+
+/*
+ * Hash callback for the manifestfiles table: hash a NUL-terminated key.
+ */
+static uint32
+hash_string_pointer(char *s)
+{
+	size_t		len = strlen(s);
+
+	return hash_bytes((unsigned char *) s, len);
+}
+
+/*
+ * Map one hexadecimal digit character (either case) to its numeric value,
+ * or return -1 if the character is not a hexadecimal digit.
+ */
+static int
+hexdecode_char(char c)
+{
+	int			value = -1;
+
+	if ('0' <= c && c <= '9')
+		value = c - '0';
+	else if ('a' <= c && c <= 'f')
+		value = 10 + (c - 'a');
+	else if ('A' <= c && c <= 'F')
+		value = 10 + (c - 'A');
+	return value;
+}
+
+/*
+ * Decode 2 * nbytes hex characters from 'input' into nbytes bytes of 'result'.
+ *
+ * Returns false as soon as a byte contains a non-hex character; else true.
+ */
+static bool
+hexdecode_string(uint8 *result, char *input, int nbytes)
+{
+	int			pos = 0;
+
+	while (pos < nbytes)
+	{
+		int			hi = hexdecode_char(input[2 * pos]);
+		int			lo = hexdecode_char(input[2 * pos + 1]);
+
+		if (hi < 0 || lo < 0)
+			return false;
+		result[pos++] = (hi << 4) | lo;
+	}
+
+	return true;
+}
-- 
2.17.2 (Apple Git-113)