From 30952505a3428a161f94d720cd08cae9d06afbde Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 9 Mar 2020 15:23:24 -0400 Subject: [PATCH v11 3/5] pg_validatebackup: Validate a backup against the backup manifest. Patch by me; review by Tushar Ahuja and Rajkumar Raghuwanshi, and also off-list by Mark Dilger, Davinder Singh, and Jeevan Chalke. --- src/bin/Makefile | 1 + src/bin/pg_validatebackup/.gitignore | 1 + src/bin/pg_validatebackup/Makefile | 33 + src/bin/pg_validatebackup/parse_manifest.c | 547 +++++++++++++ src/bin/pg_validatebackup/parse_manifest.h | 40 + src/bin/pg_validatebackup/pg_validatebackup.c | 732 ++++++++++++++++++ 6 files changed, 1354 insertions(+) create mode 100644 src/bin/pg_validatebackup/.gitignore create mode 100644 src/bin/pg_validatebackup/Makefile create mode 100644 src/bin/pg_validatebackup/parse_manifest.c create mode 100644 src/bin/pg_validatebackup/parse_manifest.h create mode 100644 src/bin/pg_validatebackup/pg_validatebackup.c diff --git a/src/bin/Makefile b/src/bin/Makefile index 7f4120a34f..77bceea4fe 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -27,6 +27,7 @@ SUBDIRS = \ pg_test_fsync \ pg_test_timing \ pg_upgrade \ + pg_validatebackup \ pg_waldump \ pgbench \ psql \ diff --git a/src/bin/pg_validatebackup/.gitignore b/src/bin/pg_validatebackup/.gitignore new file mode 100644 index 0000000000..3ae1c1f03a --- /dev/null +++ b/src/bin/pg_validatebackup/.gitignore @@ -0,0 +1 @@ +/pg_validatebackup diff --git a/src/bin/pg_validatebackup/Makefile b/src/bin/pg_validatebackup/Makefile new file mode 100644 index 0000000000..dde7eb3c02 --- /dev/null +++ b/src/bin/pg_validatebackup/Makefile @@ -0,0 +1,33 @@ +# src/bin/pg_validatebackup/Makefile + +PGFILEDESC = "pg_validatebackup - validate a backup against a backup manifest" +PGAPPICON = win32 + +subdir = src/bin/pg_validatebackup +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +# We need libpq only because fe_utils does. 
+LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) + +OBJS = \ + $(WIN32RES) \ + parse_manifest.o \ + pg_validatebackup.o + +all: pg_validatebackup + +pg_validatebackup: $(OBJS) | submake-libpq submake-libpgport submake-libpgfeutils + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +install: all installdirs + $(INSTALL_PROGRAM) pg_validatebackup$(X) '$(DESTDIR)$(bindir)/pg_validatebackup$(X)' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/pg_validatebackup$(X)' + +clean distclean maintainer-clean: + rm -f pg_validatebackup$(X) $(OBJS) diff --git a/src/bin/pg_validatebackup/parse_manifest.c b/src/bin/pg_validatebackup/parse_manifest.c new file mode 100644 index 0000000000..b8ee889c50 --- /dev/null +++ b/src/bin/pg_validatebackup/parse_manifest.c @@ -0,0 +1,547 @@ +/*------------------------------------------------------------------------- + * + * parse_manifest.c + * Parse a backup manifest in JSON format. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_validatebackup/parse_manifest.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include "parse_manifest.h" +#include "common/jsonapi.h" + +/* + * Semantic states for JSON manifest parsing; each names what must come next. + */ +typedef enum +{ + JM_EXPECT_TOPLEVEL_START, + JM_EXPECT_TOPLEVEL_END, + JM_EXPECT_VERSION_FIELD, + JM_EXPECT_VERSION_VALUE, + JM_EXPECT_FILES_FIELD, + JM_EXPECT_FILES_ARRAY_START, + JM_EXPECT_FILES_ARRAY_NEXT, + JM_EXPECT_THIS_FILE_FIELD, + JM_EXPECT_THIS_FILE_VALUE, + JM_EXPECT_MANIFEST_CHECKSUM_FIELD, + JM_EXPECT_MANIFEST_CHECKSUM_VALUE, + JM_EXPECT_EOF +} JsonManifestSemanticState; + +/* + * Possible fields for one file as described by the manifest. 
+ */ +typedef enum +{ + JMFF_PATH, + JMFF_SIZE, + JMFF_LAST_MODIFIED, + JMFF_CHECKSUM_ALGORITHM, + JMFF_CHECKSUM +} JsonManifestFileField; + +/* + * Internal state used while decoding the JSON-format backup manifest. + */ +typedef struct +{ + JsonManifestParseContext *context; + JsonManifestSemanticState state; /* what the parser expects next */ + JsonManifestFileField field; /* per-file key seen most recently */ + char *pathname; + char *size; + char *algorithm; + pg_checksum_type checksum_algorithm; /* not referenced in this file */ + char *checksum; + char *manifest_checksum; +} JsonManifestParseState; + +static void json_manifest_object_start(void *state); +static void json_manifest_object_end(void *state); +static void json_manifest_array_start(void *state); +static void json_manifest_array_end(void *state); +static void json_manifest_object_field_start(void *state, char *fname, + bool isnull); +static void json_manifest_scalar(void *state, char *token, + JsonTokenType tokentype); +static void json_manifest_finalize_file(JsonManifestParseState *parse); +static void verify_manifest_checksum(JsonManifestParseState *parse, + char *buffer, size_t size); +static void json_manifest_parse_failure(JsonManifestParseContext *context, + char *msg); + +static int hexdecode_char(char c); +static bool hexdecode_string(uint8 *result, char *input, int nbytes); + +/* + * Main entrypoint to parse a JSON-format backup manifest. + * + * Caller should set up the parsing context and then invoke this function. + * For each file whose information is extracted from the manifest, + * context->perfile_cb is invoked. In case of trouble, context->error_cb is + * invoked and is expected not to return. + */ +void +json_parse_manifest(JsonManifestParseContext *context, char *buffer, + size_t size) +{ + JsonLexContext *lex; + JsonParseErrorType json_error; + JsonSemAction sem; + JsonManifestParseState parse; + + /* + * Set up our private parsing context. Only these two fields are set + * here; the string fields are assigned by the semantic callbacks before + * anything reads them. + */ + parse.state = JM_EXPECT_TOPLEVEL_START; + parse.context = context; + + /* Create a JSON lexing context. 
*/ + lex = makeJsonLexContextCstringLen(buffer, size, PG_UTF8, true); + + /* Set up semantic actions. */ + sem.semstate = &parse; + sem.object_start = json_manifest_object_start; + sem.object_end = json_manifest_object_end; + sem.array_start = json_manifest_array_start; + sem.array_end = json_manifest_array_end; + sem.object_field_start = json_manifest_object_field_start; + sem.object_field_end = NULL; + sem.array_element_start = NULL; + sem.array_element_end = NULL; + sem.scalar = json_manifest_scalar; + + /* Run the actual JSON parser. */ + json_error = pg_parse_json(lex, &sem); + if (json_error != JSON_SUCCESS) + json_manifest_parse_failure(context, json_errdetail(json_error, lex)); + if (parse.state != JM_EXPECT_EOF) + json_manifest_parse_failure(context, "manifest ended unexpectedly"); + + /* Validate the checksum. */ + verify_manifest_checksum(&parse, buffer, size); +} + +/* + * Invoked at the start of each object in the JSON document. + * + * The document as a whole is expected to be an object with three keys + * (PostgreSQL-Backup-Manifest-Version, Files, Manifest-Checksum) and each + * file is expected to be an object with various keys (Path, Size, etc.). + * If we're not at the beginning of either the toplevel object or the object + * for a particular file, it's an error. + */ +static void +json_manifest_object_start(void *state) +{ + JsonManifestParseState *parse = state; + + switch (parse->state) + { + case JM_EXPECT_TOPLEVEL_START: + parse->state = JM_EXPECT_VERSION_FIELD; + break; + case JM_EXPECT_FILES_ARRAY_NEXT: + parse->state = JM_EXPECT_THIS_FILE_FIELD; + parse->pathname = NULL; + parse->size = NULL; + parse->algorithm = NULL; + parse->checksum = NULL; + break; + default: + json_manifest_parse_failure(parse->context, + "unexpected object start"); + break; + } +} + +/* + * Invoked at the end of each object in the JSON document. + * + * The possible cases here are the same as for json_manifest_object_start. 
+ * There's nothing special to do at the end of the document, but when we + * reach the end of an object representing a particular file, we must call + * json_manifest_finalize_file() to save the associated details. + */ +static void +json_manifest_object_end(void *state) +{ + JsonManifestParseState *parse = state; + + switch (parse->state) + { + case JM_EXPECT_TOPLEVEL_END: + parse->state = JM_EXPECT_EOF; + break; + case JM_EXPECT_THIS_FILE_FIELD: + json_manifest_finalize_file(parse); + parse->state = JM_EXPECT_FILES_ARRAY_NEXT; + break; + default: + json_manifest_parse_failure(parse->context, + "unexpected object end"); + break; + } +} + +/* + * Invoked at the start of each array in the JSON document. + * + * Within the toplevel object, the value associated with the "Files" key + * should be an array. No other arrays are expected. + */ +static void +json_manifest_array_start(void *state) +{ + JsonManifestParseState *parse = state; + + switch (parse->state) + { + case JM_EXPECT_FILES_ARRAY_START: + parse->state = JM_EXPECT_FILES_ARRAY_NEXT; + break; + default: + json_manifest_parse_failure(parse->context, + "unexpected array start"); + break; + } +} + +/* + * Invoked at the end of each array in the JSON document. + * + * Just like json_manifest_array_start, there's only one expected case + * here: the end of the "Files" array, after which the manifest checksum + * key must follow. + */ +static void +json_manifest_array_end(void *state) +{ + JsonManifestParseState *parse = state; + + switch (parse->state) + { + case JM_EXPECT_FILES_ARRAY_NEXT: + parse->state = JM_EXPECT_MANIFEST_CHECKSUM_FIELD; + break; + default: + json_manifest_parse_failure(parse->context, + "unexpected array end"); + break; + } +} + +/* + * Invoked at the start of each object field in the JSON document. + * + * The toplevel keys are only accepted in a fixed order (version, "Files", + * "Manifest-Checksum"); the 'isnull' argument is not used. + */ +static void +json_manifest_object_field_start(void *state, char *fname, bool isnull) +{ + JsonManifestParseState *parse = state; + + switch (parse->state) + { + case JM_EXPECT_VERSION_FIELD: + /* Inside toplevel object, expecting version indicator. 
*/ + if (strcmp(fname, "PostgreSQL-Backup-Manifest-Version") != 0) + json_manifest_parse_failure(parse->context, + "expected version indicator"); + parse->state = JM_EXPECT_VERSION_VALUE; + break; + case JM_EXPECT_FILES_FIELD: + /* Inside toplevel object, expecting "Files" next. */ + if (strcmp(fname, "Files") != 0) + json_manifest_parse_failure(parse->context, + "expected file list"); + parse->state = JM_EXPECT_FILES_ARRAY_START; + break; + case JM_EXPECT_THIS_FILE_FIELD: + /* Inside object for one file; keys may appear in any order. */ + if (strcmp(fname, "Path") == 0) + parse->field = JMFF_PATH; + else if (strcmp(fname, "Size") == 0) + parse->field = JMFF_SIZE; + else if (strcmp(fname, "Last-Modified") == 0) + parse->field = JMFF_LAST_MODIFIED; + else if (strcmp(fname, "Checksum-Algorithm") == 0) + parse->field = JMFF_CHECKSUM_ALGORITHM; + else if (strcmp(fname, "Checksum") == 0) + parse->field = JMFF_CHECKSUM; + else + json_manifest_parse_failure(parse->context, + "unexpected file field"); + parse->state = JM_EXPECT_THIS_FILE_VALUE; + break; + case JM_EXPECT_MANIFEST_CHECKSUM_FIELD: + /* Inside toplevel object, expecting "Manifest-Checksum" next. */ + if (strcmp(fname, "Manifest-Checksum") != 0) + json_manifest_parse_failure(parse->context, + "expected manifest checksum"); + parse->state = JM_EXPECT_MANIFEST_CHECKSUM_VALUE; + break; + default: + json_manifest_parse_failure(parse->context, + "unexpected object field"); + break; + } +} + +/* + * Invoked at the start of each scalar in the JSON document. + * + * Object field names don't reach this code; those are handled by + * json_manifest_object_field_start. When we're inside of the object for + * a particular file, that function will have noticed the name of the field, + * and we'll get the corresponding value here. When we're in the toplevel + * object, the parse state itself tells us which field this is. 
+ * + * In all cases except for PostgreSQL-Backup-Manifest-Version, which we + * can just check on the spot, the goal here is just to save the value in + * the parse state for later use. We don't actually do anything until we + * reach either the end of the object representing this file, or the end + * of the manifest, as the case may be. + */ +static void +json_manifest_scalar(void *state, char *token, JsonTokenType tokentype) +{ + JsonManifestParseState *parse = state; + + switch (parse->state) + { + case JM_EXPECT_VERSION_VALUE: + if (strcmp(token, "1") != 0) + json_manifest_parse_failure(parse->context, + "unexpected manifest version"); + parse->state = JM_EXPECT_FILES_FIELD; + break; + case JM_EXPECT_THIS_FILE_VALUE: + switch (parse->field) + { + case JMFF_PATH: + parse->pathname = token; + break; + case JMFF_SIZE: + parse->size = token; + break; + case JMFF_LAST_MODIFIED: + pfree(token); /* unused */ + break; + case JMFF_CHECKSUM_ALGORITHM: + parse->algorithm = token; + break; + case JMFF_CHECKSUM: + parse->checksum = token; + break; + } + parse->state = JM_EXPECT_THIS_FILE_FIELD; + break; + case JM_EXPECT_MANIFEST_CHECKSUM_VALUE: + parse->state = JM_EXPECT_TOPLEVEL_END; + parse->manifest_checksum = token; + break; + default: + json_manifest_parse_failure(parse->context, "unexpected scalar"); + break; + } +} + +/* + * Do additional parsing and sanity-checking of the details gathered for one + * file, and invoke the per-file callback so that the caller gets those + * details. This happens for each file when the corresponding JSON object is + * completely parsed. + * + * The pathname and the decoded checksum payload are handed to the callback + * and are not freed here; the size, algorithm, and checksum strings are + * freed before returning. + */ +static void +json_manifest_finalize_file(JsonManifestParseState *parse) +{ + JsonManifestParseContext *context = parse->context; + size_t size; + char *ep; + int checksum_string_length; + pg_checksum_type checksum_type; + int checksum_length; + uint8 *checksum_payload; + + /* Pathname and size are required. 
*/ + if (parse->pathname == NULL) + json_manifest_parse_failure(parse->context, "missing pathname"); + if (parse->size == NULL) + json_manifest_parse_failure(parse->context, "missing size"); + if (parse->algorithm == NULL && parse->checksum != NULL) + json_manifest_parse_failure(parse->context, + "checksum without algorithm"); + + /* + * Parse size. strtoul() by itself would accept an empty string, leading + * whitespace, or a sign, and would silently saturate on overflow, so + * insist that the string starts with a digit, is consumed entirely, and + * fits in range. + */ + errno = 0; + size = strtoul(parse->size, &ep, 10); + if (parse->size[0] < '0' || parse->size[0] > '9' || + *ep != '\0' || errno == ERANGE) + json_manifest_parse_failure(parse->context, + "file size is not an integer"); + + /* Parse the checksum algorithm, if it's present. */ + if (parse->algorithm == NULL) + checksum_type = CHECKSUM_TYPE_NONE; + else if (!pg_checksum_parse_type(parse->algorithm, &checksum_type)) + context->error_cb(context, "unrecognized checksum algorithm: \"%s\"", + parse->algorithm); + + /* Parse the checksum payload, if it's present. */ + checksum_string_length = parse->checksum == NULL ? 0 + : strlen(parse->checksum); + if (checksum_string_length == 0) + { + checksum_length = 0; + checksum_payload = NULL; + } + else + { + checksum_length = checksum_string_length / 2; + checksum_payload = palloc(checksum_length); + if (checksum_string_length % 2 != 0 || + !hexdecode_string(checksum_payload, parse->checksum, + checksum_length)) + context->error_cb(context, + "invalid checksum for file \"%s\": \"%s\"", + parse->pathname, parse->checksum); + } + + /* Invoke the callback with the details we've gathered. */ + context->perfile_cb(context, parse->pathname, size, + checksum_type, checksum_length, checksum_payload); + + /* Free memory we no longer need. */ + if (parse->size != NULL) + { + pfree(parse->size); + parse->size = NULL; + } + if (parse->algorithm != NULL) + { + pfree(parse->algorithm); + parse->algorithm = NULL; + } + if (parse->checksum != NULL) + { + pfree(parse->checksum); + parse->checksum = NULL; + } +} + +/* + * Verify that the manifest checksum is correct. 
+ * + * The last line of the manifest file is excluded from the manifest checksum, + * because the last line is expected to contain the checksum that covers + * the rest of the file. + */ +static void +verify_manifest_checksum(JsonManifestParseState *parse, char *buffer, + size_t size) +{ + JsonManifestParseContext *context = parse->context; + size_t i; + size_t number_of_newlines = 0; + size_t ultimate_newline = 0; + size_t penultimate_newline = 0; + pg_sha256_ctx manifest_ctx; + uint8 manifest_checksum_actual[PG_SHA256_DIGEST_LENGTH]; + uint8 manifest_checksum_expected[PG_SHA256_DIGEST_LENGTH]; + + /* Find the last two newlines in the file. */ + for (i = 0; i < size; ++i) + { + if (buffer[i] == '\n') + { + ++number_of_newlines; + penultimate_newline = ultimate_newline; + ultimate_newline = i; + } + } + + /* + * Make sure that the last newline is right at the end, and that there are + * at least two lines total. We need this to be true in order for the + * following code, which computes the manifest checksum, to work properly. + */ + if (number_of_newlines < 2) + json_manifest_parse_failure(parse->context, + "expected at least 2 lines"); + if (ultimate_newline != size - 1) + json_manifest_parse_failure(parse->context, + "last line not newline-terminated"); + + /* Checksum everything up to and including the penultimate newline. */ + pg_sha256_init(&manifest_ctx); + pg_sha256_update(&manifest_ctx, (uint8 *) buffer, penultimate_newline + 1); + pg_sha256_final(&manifest_ctx, manifest_checksum_actual); + + /* Now verify it. 
*/ + if (parse->manifest_checksum == NULL) + context->error_cb(parse->context, "manifest has no checksum"); + if (strlen(parse->manifest_checksum) != PG_SHA256_DIGEST_LENGTH * 2 || + !hexdecode_string(manifest_checksum_expected, parse->manifest_checksum, + PG_SHA256_DIGEST_LENGTH)) + context->error_cb(context, "invalid manifest checksum: \"%s\"", + parse->manifest_checksum); + if (memcmp(manifest_checksum_actual, manifest_checksum_expected, + PG_SHA256_DIGEST_LENGTH) != 0) + context->error_cb(context, "manifest checksum mismatch"); +} + +/* + * Report a parse error. + * + * This is intended to be used for fairly low-level failures that probably + * shouldn't occur unless somebody has deliberately constructed a bad manifest, + * or unless the server is generating bad manifests due to some bug. msg should + * be a short string giving some hint as to what the problem is. + */ +static void +json_manifest_parse_failure(JsonManifestParseContext *context, char *msg) +{ + context->error_cb(context, "could not parse backup manifest: %s", msg); +} + +/* + * Convert a character which represents a hexadecimal digit to an integer. + * + * Returns -1 if the character is not a hexadecimal digit. + */ +static int +hexdecode_char(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. 
+ */ +static bool +hexdecode_string(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = hexdecode_char(input[i * 2]); + int n2 = hexdecode_char(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} diff --git a/src/bin/pg_validatebackup/parse_manifest.h b/src/bin/pg_validatebackup/parse_manifest.h new file mode 100644 index 0000000000..b0b18a57ca --- /dev/null +++ b/src/bin/pg_validatebackup/parse_manifest.h @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * parse_manifest.h + * Parse a backup manifest in JSON format. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_validatebackup/parse_manifest.h + * + *------------------------------------------------------------------------- + */ + +#ifndef PARSE_MANIFEST_H +#define PARSE_MANIFEST_H + +#include "common/checksum_helper.h" +#include "mb/pg_wchar.h" + +struct JsonManifestParseContext; +typedef struct JsonManifestParseContext JsonManifestParseContext; + +typedef void (*json_manifest_perfile_callback)(JsonManifestParseContext *, + char *pathname, + size_t size, pg_checksum_type checksum_type, + int checksum_length, uint8 *checksum_payload); +typedef void (*json_manifest_error_callback)(JsonManifestParseContext *, + char *fmt, ...); + +struct JsonManifestParseContext +{ + void *private_data; + json_manifest_perfile_callback perfile_cb; + json_manifest_error_callback error_cb; +}; + +extern void json_parse_manifest(JsonManifestParseContext *context, + char *buffer, size_t size); + +#endif diff --git a/src/bin/pg_validatebackup/pg_validatebackup.c b/src/bin/pg_validatebackup/pg_validatebackup.c new file mode 100644 index 0000000000..0e7299b1b9 --- /dev/null +++ b/src/bin/pg_validatebackup/pg_validatebackup.c @@ -0,0 +1,732 @@ 
+/*------------------------------------------------------------------------- + * + * pg_validatebackup.c + * Validate a backup against a backup manifest. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_validatebackup/pg_validatebackup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include <dirent.h> +#include <fcntl.h> +#include <sys/stat.h> + +#include "common/hashfn.h" +#include "common/logging.h" +#include "fe_utils/simple_list.h" +#include "getopt_long.h" +#include "parse_manifest.h" + +/* + * For efficiency, we'd like our hash table containing information about the + * manifest to start out with approximately the correct number of entries. + * There's no way to know the exact number of entries without reading the whole + * file, but we can get an estimate by dividing the file size by the estimated + * number of bytes per line. + * + * This could be off by about a factor of two in either direction, because the + * checksum algorithm has a big impact on the line lengths; e.g. a SHA512 + * checksum is 128 hex bytes, whereas a CRC-32C value is only 8, and there + * might be no checksum at all. + */ +#define ESTIMATED_BYTES_PER_MANIFEST_LINE 100 + +/* + * How many bytes should we try to read from a file at once? + */ +#define READ_CHUNK_SIZE 4096 + +/* + * Information about each file described by the manifest file is parsed to + * produce an object like this. + */ +typedef struct manifestfile +{ + uint32 status; /* hash status */ + char *pathname; /* hash key: path as given in the manifest */ + size_t size; + pg_checksum_type checksum_type; + int checksum_length; + uint8 *checksum_payload; + bool matched; /* set once the file is found on disk */ + bool bad; /* set when the on-disk size differs */ +} manifestfile; + +/* + * Define a hash table which we can use to store information about the files + * mentioned in the backup manifest. 
+ */ +static uint32 hash_string_pointer(char *s); +#define SH_PREFIX manifestfiles +#define SH_ELEMENT_TYPE manifestfile +#define SH_KEY_TYPE char * +#define SH_KEY pathname +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +/* + * All of the context information we need while checking a backup manifest. + */ +typedef struct validator_context +{ + manifestfiles_hash *ht; /* manifest entries, keyed by pathname */ + char *backup_directory; /* canonicalized command-line argument */ + SimpleStringList ignore_list; /* built-in entries plus -i arguments */ + bool exit_on_error; /* set by -e / --exit-on-error */ + bool saw_any_error; /* if set, main() returns 1 */ +} validator_context; + +static manifestfiles_hash *parse_manifest_file(char *manifest_path); + +static void record_manifest_details_for_file(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +static void report_manifest_error(JsonManifestParseContext *context, + char *fmt, ...); + +static void validate_backup_directory(validator_context *context, + char *relpath, char *fullpath); +static void validate_backup_file(validator_context *context, + char *relpath, char *fullpath); +static void report_extra_backup_files(validator_context *context); +static void validate_backup_checksums(validator_context *context); +static void validate_file_checksum(validator_context *context, + manifestfile *tabent, char *pathname); + +static void report_backup_error(validator_context *context, + const char *pg_restrict fmt,...) + pg_attribute_printf(2, 3); +static void report_fatal_error(const char *pg_restrict fmt,...) + pg_attribute_printf(1, 2) pg_attribute_noreturn(); +static bool should_ignore_relpath(validator_context *context, char *relpath); + +static void usage(void); + +static const char *progname; + +/* + * Main entry point. 
+ */ +int +main(int argc, char **argv) +{ + static struct option long_options[] = { + {"exit-on-error", no_argument, NULL, 'e'}, + {"ignore", required_argument, NULL, 'i'}, + {"manifest-path", required_argument, NULL, 'm'}, + {"quiet", no_argument, NULL, 'q'}, + {"skip-checksums", no_argument, NULL, 's'}, + {NULL, 0, NULL, 0} + }; + + int c; + validator_context context; + char *manifest_path = NULL; + bool quiet = false; + bool skip_checksums = false; + + pg_logging_init(argv[0]); + set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_validatebackup")); + progname = get_progname(argv[0]); + + memset(&context, 0, sizeof(context)); + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + usage(); + exit(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pg_validatebackup (PostgreSQL) " PG_VERSION); + exit(0); + } + } + + /* + * Skip certain files in the toplevel directory. + * + * Ignore the backup_manifest file, because it's not included in the + * backup manifest. + * + * Ignore the pg_wal directory, because those files are not included in + * the backup manifest either, since they are fetched separately from the + * backup itself. + * + * Ignore postgresql.auto.conf, recovery.signal, and standby.signal, + * because we expect that those files may sometimes be created or changed + * as part of the backup process. For example, pg_basebackup -R will + * modify postgresql.auto.conf and create standby.signal. 
+ */ + simple_string_list_append(&context.ignore_list, "backup_manifest"); + simple_string_list_append(&context.ignore_list, "pg_wal"); + simple_string_list_append(&context.ignore_list, "postgresql.auto.conf"); + simple_string_list_append(&context.ignore_list, "recovery.signal"); + simple_string_list_append(&context.ignore_list, "standby.signal"); + + while ((c = getopt_long(argc, argv, "ei:m:qs", long_options, NULL)) != -1) + { + switch (c) + { + case 'e': + context.exit_on_error = true; + break; + case 'i': + { + char *arg = pstrdup(optarg); + + canonicalize_path(arg); + simple_string_list_append(&context.ignore_list, arg); + break; + } + case 'm': + manifest_path = pstrdup(optarg); + canonicalize_path(manifest_path); + break; + case 'q': + quiet = true; + break; + case 's': + skip_checksums = true; + break; + default: + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(1); + } + } + + /* Get backup directory name */ + if (optind >= argc) + { + pg_log_fatal("no backup directory specified"); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(1); + } + context.backup_directory = pstrdup(argv[optind++]); + canonicalize_path(context.backup_directory); + + /* Complain if any arguments remain */ + if (optind < argc) + { + pg_log_fatal("too many command-line arguments (first is \"%s\")", + argv[optind]); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(1); + } + + /* By default, look for the manifest in the backup directory. */ + if (manifest_path == NULL) + manifest_path = psprintf("%s/backup_manifest", + context.backup_directory); + + /* + * Try to read the manifest. We treat any errors encountered while parsing + * the manifest as fatal; there doesn't seem to be much point in trying to + * validate the backup directory against a corrupted manifest. + */ + context.ht = parse_manifest_file(manifest_path); + + /* + * Now scan the files in the backup directory. 
At this stage, we verify + * that every file on disk is present in the manifest and that the sizes + * match. We also set the "matched" flag on every manifest entry that + * corresponds to a file on disk. + */ + validate_backup_directory(&context, NULL, context.backup_directory); + + /* + * The "matched" flag should now be set on every entry in the hash table. + * Any entries for which the bit is not set are files mentioned in the + * manifest that don't exist on disk. + */ + report_extra_backup_files(&context); + + /* + * Finally, do the expensive work of verifying file checksums, unless we + * were told to skip it. + */ + if (!skip_checksums) + validate_backup_checksums(&context); + + /* + * If everything looks OK, tell the user this, unless we were asked to + * work quietly. + */ + if (!context.saw_any_error && !quiet) + printf("backup successfully verified\n"); + + return context.saw_any_error ? 1 : 0; +} + +/* + * Parse a manifest file and construct a hash table with information about + * all the files it mentions. + */ +static manifestfiles_hash * +parse_manifest_file(char *manifest_path) +{ + int fd; + struct stat statbuf; + off_t estimate; + uint32 initial_size; + manifestfiles_hash *ht; + char *buffer; + int rc; + JsonManifestParseContext context; + + /* Open the manifest file. */ + if ((fd = open(manifest_path, O_RDONLY | PG_BINARY, 0)) < 0) + report_fatal_error("could not open file \"%s\": %m", manifest_path); + + /* Figure out how big the manifest is. */ + if (fstat(fd, &statbuf) != 0) + report_fatal_error("could not stat file \"%s\": %m", manifest_path); + + /* Guess how large to make the hash table based on the manifest size. */ + estimate = statbuf.st_size / ESTIMATED_BYTES_PER_MANIFEST_LINE; + initial_size = Min(PG_UINT32_MAX, Max(estimate, 256)); + + /* Create the hash table. */ + ht = manifestfiles_create(initial_size, NULL); + + /* + * Slurp in the whole file. 
+ * + * This is not ideal, but there's currently no easy way to get + * pg_parse_json() to perform incremental parsing. + */ + buffer = pg_malloc(statbuf.st_size); + rc = read(fd, buffer, statbuf.st_size); + if (rc != statbuf.st_size) + { + if (rc < 0) + report_fatal_error("could not read file \"%s\": %m", + manifest_path); + else + report_fatal_error("could not read file \"%s\": read %d of %zu", + manifest_path, rc, (size_t) statbuf.st_size); + } + + /* Close the manifest file. */ + close(fd); + + /* Parse the manifest as JSON. */ + context.private_data = ht; + context.perfile_cb = record_manifest_details_for_file; + context.error_cb = report_manifest_error; + json_parse_manifest(&context, buffer, statbuf.st_size); + + /* Done with the buffer. */ + pfree(buffer); + + /* Return the hash table we constructed. */ + return ht; +} + +/* + * Report an error while parsing the manifest. + * + * We consider all such errors to be fatal errors. The manifest parser + * expects this function not to return. + */ +static void +report_manifest_error(JsonManifestParseContext *context, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + pg_log_generic_v(PG_LOG_FATAL, fmt, ap); + va_end(ap); + + exit(1); +} + +/* + * Record details extracted from the backup manifest for one file. + */ +static void +record_manifest_details_for_file(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, uint8 *checksum_payload) +{ + manifestfiles_hash *ht = context->private_data; + manifestfile *tabent; + bool found; + + /* Make a new entry in the hash table for this file. */ + tabent = manifestfiles_insert(ht, pathname, &found); + if (found) + report_fatal_error("duplicate pathname in backup manifest: \"%s\"", + pathname); + + /* Initialize the entry. 
*/ + tabent->size = size; + tabent->checksum_type = checksum_type; + tabent->checksum_length = checksum_length; + tabent->checksum_payload = checksum_payload; + tabent->matched = false; + tabent->bad = false; +} + +/* + * Validate one directory. + * + * 'relpath' is NULL if we are to validate the top-level backup directory, + * and otherwise the relative path to the directory that is to be validated. + * + * 'fullpath' is the backup directory with 'relpath' appended; i.e. the actual + * filesystem path at which it can be found. + */ +static void +validate_backup_directory(validator_context *context, char *relpath, + char *fullpath) +{ + DIR *dir; + struct dirent *dirent; + + dir = opendir(fullpath); + if (dir == NULL) + { + /* + * If even the toplevel backup directory cannot be found, treat this + * as a fatal error. + */ + if (relpath == NULL) + report_fatal_error("could not open directory \"%s\": %m", fullpath); + + /* + * Otherwise, treat this as a non-fatal error, but ignore any further + * errors related to this path and anything beneath it. + */ + report_backup_error(context, + "could not open directory \"%s\": %m", fullpath); + simple_string_list_append(&context->ignore_list, relpath); + + return; + } + + while (errno = 0, (dirent = readdir(dir)) != NULL) + { + char *filename = dirent->d_name; + char *newfullpath = psprintf("%s/%s", fullpath, filename); + char *newrelpath; + + /* Skip "." and ".." */ + if (filename[0] == '.' 
&& (filename[1] == '\0' + || strcmp(filename, "..") == 0)) + continue; + + if (relpath == NULL) + newrelpath = pstrdup(filename); + else + newrelpath = psprintf("%s/%s", relpath, filename); + + if (!should_ignore_relpath(context, newrelpath)) + validate_backup_file(context, newrelpath, newfullpath); + + pfree(newfullpath); + pfree(newrelpath); + } + + if (closedir(dir)) + { + report_backup_error(context, + "could not close directory \"%s\": %m", fullpath); + return; + } +} + +/* + * Validate one file (which might actually be a directory or a symlink). + * + * The arguments to this function have the same meaning as the arguments to + * validate_backup_directory. + */ +static void +validate_backup_file(validator_context *context, char *relpath, char *fullpath) +{ + struct stat sb; + manifestfile *tabent; + + if (stat(fullpath, &sb) != 0) + { + report_backup_error(context, + "could not stat file or directory \"%s\": %m", + relpath); + + /* + * Suppress further errors related to this path name and, if it's a + * directory, anything underneath it. + */ + simple_string_list_append(&context->ignore_list, relpath); + + return; + } + + /* If it's a directory, just recurse. */ + if (S_ISDIR(sb.st_mode)) + { + validate_backup_directory(context, relpath, fullpath); + return; + } + + /* If it's not a directory, it should be a plain file. */ + if (!S_ISREG(sb.st_mode)) + { + report_backup_error(context, + "\"%s\" is not a file or directory", + relpath); + return; + } + + /* Check whether there's an entry in the manifest hash. */ + tabent = manifestfiles_lookup(context->ht, relpath); + if (tabent == NULL) + { + report_backup_error(context, + "\"%s\" is present on disk but not in the manifest", + relpath); + return; + } + + /* Flag this entry as having been encountered in the filesystem. */ + tabent->matched = true; + + /* Check that the size matches. 
*/ + if (tabent->size != sb.st_size) + { + report_backup_error(context, + "\"%s\" has size %zu on disk but size %zu in the manifest", + relpath, (size_t) sb.st_size, tabent->size); + tabent->bad = true; + } + + /* + * We don't validate checksums at this stage. We first finish validating + * that we have the expected set of files with the expected sizes, and + * only afterwards verify the checksums. That's because computing + * checksums may take a while, and we'd like to report more obvious + * problems quickly. + */ +} + +/* + * Scan the hash table for entries where the 'matched' flag is not set; report + * that such files are present in the manifest but not on disk. + */ +static void +report_extra_backup_files(validator_context *context) +{ + manifestfiles_iterator it; + manifestfile *tabent; + + manifestfiles_start_iterate(context->ht, &it); + while ((tabent = manifestfiles_iterate(context->ht, &it)) != NULL) + if (!tabent->matched && + !should_ignore_relpath(context, tabent->pathname)) + report_backup_error(context, + "\"%s\" is present in the manifest but not on disk", + tabent->pathname); +} + +/* + * Validate checksums for hash table entries that are otherwise unproblematic. + * If we've already reported some problem related to a hash table entry, or + * if it has no checksum, just skip it. + */ +static void +validate_backup_checksums(validator_context *context) +{ + manifestfiles_iterator it; + manifestfile *tabent; + + manifestfiles_start_iterate(context->ht, &it); + while ((tabent = manifestfiles_iterate(context->ht, &it)) != NULL) + { + if (tabent->matched && !tabent->bad && + tabent->checksum_type != CHECKSUM_TYPE_NONE && + !should_ignore_relpath(context, tabent->pathname)) + { + char *fullpath; + + /* Compute the full pathname to the target file. */ + fullpath = psprintf("%s/%s", context->backup_directory, + tabent->pathname); + + /* Do the actual checksum validation. */ + validate_file_checksum(context, tabent, fullpath); + + /* Avoid leaking memory. 
*/ + pfree(fullpath); + } + } +} + +/* + * Validate the checksum of a single file. + */ +static void +validate_file_checksum(validator_context *context, manifestfile *tabent, + char *fullpath) +{ + pg_checksum_context checksum_ctx; + char *relpath = tabent->pathname; + int fd; + int rc; + uint8 buffer[READ_CHUNK_SIZE]; + uint8 checksumbuf[PG_CHECKSUM_MAX_LENGTH]; + int checksumlen; + + /* Open the target file. */ + if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) < 0) + { + report_backup_error(context, "could not open file \"%s\": %m", + relpath); + return; + } + + /* Initialize checksum context. */ + pg_checksum_init(&checksum_ctx, tabent->checksum_type); + + /* Read the file chunk by chunk, updating the checksum as we go. */ + while ((rc = read(fd, buffer, READ_CHUNK_SIZE)) > 0) + pg_checksum_update(&checksum_ctx, buffer, rc); + if (rc < 0) + report_backup_error(context, "could not read file \"%s\": %m", + relpath); + + /* Close the file. */ + if (close(fd) != 0) + { + report_backup_error(context, "could not close file \"%s\": %m", + relpath); + return; + } + + /* If we didn't manage to read the whole file, bail out now. */ + if (rc < 0) + return; + + /* Get the final checksum. */ + checksumlen = pg_checksum_final(&checksum_ctx, checksumbuf); + + /* And check it against the manifest. */ + if (checksumlen != tabent->checksum_length) + report_backup_error(context, + "file \"%s\" has checksum of length %d, but expected %d", + relpath, tabent->checksum_length, checksumlen); + else if (memcmp(checksumbuf, tabent->checksum_payload, checksumlen) != 0) + report_backup_error(context, + "checksum mismatch for file \"%s\"", + relpath); +} + +/* + * Report a problem with the backup. + * + * Update the context to indicate that we saw an error, and exit if the + * context says we should. + */ +static void +report_backup_error(validator_context *context, const char *pg_restrict fmt,...) 
+{ + va_list ap; + + va_start(ap, fmt); + pg_log_generic_v(PG_LOG_ERROR, fmt, ap); + va_end(ap); + + context->saw_any_error = true; + if (context->exit_on_error) + exit(1); +} + +/* + * Report a fatal error and exit + */ +static void +report_fatal_error(const char *pg_restrict fmt,...) +{ + va_list ap; + + va_start(ap, fmt); + pg_log_generic_v(PG_LOG_FATAL, fmt, ap); + va_end(ap); + + exit(1); +} + +/* + * Is the specified relative path, or some prefix of it, listed in the set + * of paths to ignore? + * + * Note that by "prefix" we mean a parent directory; for this purpose, + * "aa/bb" is not a prefix of "aa/bbb", but it is a prefix of "aa/bb/cc". + */ +static bool +should_ignore_relpath(validator_context *context, char *relpath) +{ + SimpleStringListCell *cell; + + for (cell = context->ignore_list.head; cell != NULL; cell = cell->next) + { + char *r = relpath; + char *v = cell->val; + + while (*v != '\0' && *r == *v) + ++r, ++v; + + if (*v == '\0' && (*r == '\0' || *r == '/')) + return true; + } + + return false; +} + +/* + * Helper function for manifestfiles hash table. + */ +static uint32 +hash_string_pointer(char *s) +{ + unsigned char *ss = (unsigned char *) s; + + return hash_bytes(ss, strlen(s)); +} + +/* + * Print out usage information and exit. + */ +static void +usage(void) +{ + printf(_("%s validates a backup against the backup manifest.\n\n"), progname); + printf(_("Usage:\n %s [OPTION]... BACKUPDIR\n\n"), progname); + printf(_("Options:\n")); + printf(_(" -e, --exit-on-error exit immediately on error\n")); + printf(_(" -i, --ignore=RELATIVE_PATH ignore indicated path\n")); + printf(_(" -m, --manifest=PATH use specified path for manifest\n")); + printf(_(" -s, --skip-checksums skip checksum verification\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -?, --help show this help, then exit\n")); + printf(_("\nReport bugs to .\n")); +} -- 2.17.2 (Apple Git-113)