From 527eaf616026b8132937c543e961204b7051145c Mon Sep 17 00:00:00 2001
From: Robert Haas
Date: Thu, 27 Feb 2020 21:05:02 +0530
Subject: [PATCH v8 5/5] WIP: Validate JSON-format manifest.

Replace the tab-separated, line-oriented manifest parser in
pg_validatebackup with one built on pg_parse_json().  The manifest is
read into memory in one piece and walked by a small state machine driven
by the JSON semantic callbacks; each file object in the "Files" array
becomes one entry in the manifest hash table.

Still WIP: in the hunks below the Manifest-Checksum value is only logged,
it is not compared against a computed checksum.

Review fixes folded in:
- zero-initialize the parse state, so a file object without a "Size"
  member is reported as "missing size" instead of reading an
  uninitialized pointer;
- exit after reporting "manifest ended unexpectedly";
- reject an empty file size string;
- when a file's checksum is invalid, report the checksum text from the
  manifest rather than the binary decode buffer.
---
 src/bin/pg_validatebackup/pg_validatebackup.c | 690 ++++++++----------
 1 file changed, 315 insertions(+), 375 deletions(-)

diff --git a/src/bin/pg_validatebackup/pg_validatebackup.c b/src/bin/pg_validatebackup/pg_validatebackup.c
index 4f47b20855..1b0a449470 100644
--- a/src/bin/pg_validatebackup/pg_validatebackup.c
+++ b/src/bin/pg_validatebackup/pg_validatebackup.c
@@ -19,9 +19,11 @@
 
 #include "common/checksum_helper.h"
 #include "common/hashfn.h"
+#include "common/jsonapi.h"
 #include "common/logging.h"
 #include "fe_utils/simple_list.h"
 #include "getopt_long.h"
+#include "mb/pg_wchar.h"
 
 /*
  * For efficiency, we'd like our hash table containing information about the
@@ -60,8 +62,8 @@
 #define FIELDS_PER_FILE_LINE	4
 
 /*
- * Each "File" line in the manifest file is parsed to produce an object
- * like this.
+ * Information about each file described by the manifest file is parsed to
+ * produce an object like this.
  */
 typedef struct manifestfile
 {
@@ -92,6 +94,49 @@ static uint32 hash_string_pointer(char *s);
 #define SH_DEFINE
 #include "lib/simplehash.h"
 
+/*
+ * Semantic states for JSON manifest parsing.
+ */
+typedef enum
+{
+	JM_EXPECT_TOPLEVEL_START,
+	JM_EXPECT_TOPLEVEL_END,
+	JM_EXPECT_VERSION_FIELD,
+	JM_EXPECT_VERSION_VALUE,
+	JM_EXPECT_FILES_FIELD,
+	JM_EXPECT_FILES_ARRAY_START,
+	JM_EXPECT_FILES_ARRAY_NEXT,
+	JM_EXPECT_THIS_FILE_FIELD,
+	JM_EXPECT_THIS_FILE_VALUE,
+	JM_EXPECT_MANIFEST_CHECKSUM_FIELD,
+	JM_EXPECT_MANIFEST_CHECKSUM_VALUE,
+	JM_EXPECT_EOF
+} JsonManifestSemanticState;
+
+/*
+ * Possible fields for one file as described by the manifest.
+ */
+typedef enum
+{
+	JMFF_PATH,
+	JMFF_SIZE,
+	JMFF_LAST_MODIFIED,
+	JMFF_CHECKSUM_ALGORITHM,
+	JMFF_CHECKSUM
+} JsonManifestFileField;
+
+typedef struct
+{
+	JsonManifestSemanticState state;
+	JsonManifestFileField field;
+	manifestfiles_hash *ht;
+	char	   *pathname;
+	char	   *size;
+	char	   *algorithm;
+	pg_checksum_type checksum_algorithm;
+	char	   *checksum;
+} JsonManifestParseState;
+
 /*
  * All of the context information we need while checking a backup manifest.
  */
@@ -105,8 +150,15 @@ typedef struct validator_context
 } validator_context;
 
 static manifestfiles_hash * parse_manifest_file(char *manifest_path);
-static void parse_file_line_from_manifest(manifestfile *f, char *rest,
-										  int restlen);
+static void json_manifest_object_start(void *state);
+static void json_manifest_object_end(void *state);
+static void json_manifest_array_start(void *state);
+static void json_manifest_array_end(void *state);
+static void json_manifest_object_field_start(void *state, char *fname,
+											 bool isnull);
+static void json_manifest_scalar(void *state, char *token,
+								 JsonTokenType tokentype);
+
 static void validate_backup_directory(validator_context *context,
 									  char *relpath, char *fullpath);
 static void validate_backup_file(validator_context *context,
@@ -121,9 +173,6 @@ static void pg_validator_error(validator_context *context,
 			pg_attribute_printf(2, 3);
 static bool should_ignore_relpath(validator_context *context,
 								  char *relpath);
-static char *extractstr(char *buffer, int length);
-static int findchar(char *buffer, int size, char c, int start_position);
-static int findfield(char *buffer, char *end, char **result);
 static int hexdecode_char(char c);
 static bool hexdecode_string(uint8 *result, char *input, int nbytes);
 static void usage(void);
@@ -275,19 +324,15 @@ parse_manifest_file(char *manifest_path)
 	int			fd;
 	struct stat statbuf;
 	off_t		estimate;
-	off_t		bytes_read = 0;
-	off_t		bytes_consumed = 0;
 	uint32		initial_size;
 	manifestfiles_hash *ht;
 	char	   *buffer;
-	uint8		manifest_checksum_actual[PG_SHA256_DIGEST_LENGTH];
-	uint8		manifest_checksum_expected[PG_SHA256_DIGEST_LENGTH];
-	int			buffer_position = 0;
-	int			buffer_size = 0;
-	int			buffer_maxsize = 2 * READ_CHUNK_SIZE;
-	int			line_number = 0;
-	bool		saw_manifest_checksum_line = false;
+	int			rc;
 	pg_sha256_ctx manifest_ctx;
+	JsonLexContext *lex;
+	JsonParseErrorType json_error;
+	JsonSemAction sem;
+	JsonManifestParseState parse = {0};	/* string fields must start NULL */
 
 	/* Prepare to compute a checksum of the manifest itself. */
 	pg_sha256_init(&manifest_ctx);
@@ -313,296 +358,310 @@ parse_manifest_file(char *manifest_path)
 	/* Create the hash table. */
 	ht = manifestfiles_create(initial_size, NULL);
 
-	/* Initialize our read buffer. */
-	buffer = pg_malloc(buffer_maxsize);
-
 	/*
-	 * Loop until we've read it all.
+	 * Slurp in the whole file.
 	 *
-	 * The file size shouldn't be changing, so it seems fine to just error out
-	 * if the final length is different from what stat() told us.
+	 * This is not ideal, but there's currently no easy way to get
+	 * pg_parse_json() to perform incremental parsing.
 	 */
-	while (bytes_consumed < statbuf.st_size)
+	buffer = pg_malloc(statbuf.st_size);
+	rc = read(fd, buffer, statbuf.st_size);
+	if (rc != statbuf.st_size)
 	{
-		int			line_length;
-		int			first_field_length;
-		char	   *rest;
-		int			restlen;
+		if (rc < 0)
+			pg_log_fatal("could not read file \"%s\": %m",
+						 manifest_path);
+		else
+			pg_log_fatal("could not read file \"%s\": read %d of %zu",
+						 manifest_path, rc, (size_t) statbuf.st_size);
+		exit(1);
+	}
 
-		/* Find next newline if any. */
-		line_length = findchar(buffer, buffer_size, '\n', buffer_position);
+	/* Create a JSON lexing context. */
+	lex = makeJsonLexContextCstringLen(buffer, statbuf.st_size, PG_UTF8, true);
+
+	/* Set up semantic actions. */
+	parse.state = JM_EXPECT_TOPLEVEL_START;
+	parse.ht = ht;
+	sem.semstate = &parse;
+	sem.object_start = json_manifest_object_start;
+	sem.object_end = json_manifest_object_end;
+	sem.array_start = json_manifest_array_start;
+	sem.array_end = json_manifest_array_end;
+	sem.object_field_start = json_manifest_object_field_start;
+	sem.object_field_end = NULL;
+	sem.array_element_start = NULL;
+	sem.array_element_end = NULL;
+	sem.scalar = json_manifest_scalar;
+
+	/* Parse JSON. */
+	json_error = pg_parse_json(lex, &sem);
+	if (json_error != JSON_SUCCESS)
+	{
+		pg_log_fatal("could not parse backup manifest: %s",
+					 json_errdetail(json_error, lex));
+		exit(1);
+	}
+	if (parse.state != JM_EXPECT_EOF)
+	{
+		pg_log_fatal("could not parse backup manifest: %s", "manifest ended unexpectedly");
+		exit(1);
+	}
 
-		/* If no newline was found, we need to read more data and try again. */
-		if (line_length == -1)
-		{
-			size_t		bytes_to_read;
-			int			rc;
+	/* OK, we're done with the manifest file. */
+	close(fd);
 
-			bytes_to_read = Min(statbuf.st_size - bytes_read, READ_CHUNK_SIZE);
-			if (bytes_to_read == 0)
-			{
-				pg_log_fatal("manifest file line not terminated by newline");
-				exit(1);
-			}
-			if (bytes_to_read + READ_CHUNK_SIZE > buffer_maxsize)
-			{
-				buffer_maxsize += READ_CHUNK_SIZE;
-				buffer = pg_realloc(buffer, buffer_maxsize);
-				Assert(bytes_to_read + READ_CHUNK_SIZE <= buffer_maxsize);
-			}
-			rc = read(fd, buffer + buffer_size, bytes_to_read);
-			if (rc != bytes_to_read)
+	/* Return the hash table we constructed. */
+	return ht;
+}
+
+static void
+json_manifest_parse_failure(char *msg)
+{
+	pg_log_fatal("could not parse backup manifest: %s", msg);
+	exit(1);
+}
+
+static void
+json_manifest_object_start(void *state)
+{
+	JsonManifestParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case JM_EXPECT_TOPLEVEL_START:
+			parse->state = JM_EXPECT_VERSION_FIELD;
+			break;
+		case JM_EXPECT_FILES_ARRAY_NEXT:
+			parse->state = JM_EXPECT_THIS_FILE_FIELD;
+			parse->pathname = NULL;
+			parse->algorithm = NULL;
+			parse->checksum = NULL;
+			break;
+		default:
+			json_manifest_parse_failure("unexpected object start");
+			break;
+	}
+}
+
+static void
+json_manifest_object_end(void *state)
+{
+	JsonManifestParseState *parse = state;
+	manifestfile *tabent;
+	bool		found;
+	int			checksum_string_length;
+	char	   *ep;
+
+	switch (parse->state)
+	{
+		case JM_EXPECT_TOPLEVEL_END:
+			parse->state = JM_EXPECT_EOF;
+			break;
+		case JM_EXPECT_THIS_FILE_FIELD:
+			/* Pathname and size are required. */
+			if (parse->pathname == NULL)
+				json_manifest_parse_failure("missing pathname");
+			if (parse->size == NULL)
+				json_manifest_parse_failure("missing size");
+			if (parse->algorithm == NULL && parse->checksum != NULL)
+				json_manifest_parse_failure("checksum without algorithm");
+
+			/* Make a new entry in the hash table for this file. */
+			tabent = manifestfiles_insert(parse->ht, parse->pathname, &found);
+			if (found)
 			{
-				if (rc < 0)
-					pg_log_fatal("could not read file \"%s\": %m",
-								 manifest_path);
-				else
-					pg_log_fatal("could not read file \"%s\": read %d of %zu",
-								 manifest_path, rc, bytes_to_read);
+				pg_log_fatal("duplicate pathname in backup manifest: \"%s\"",
+							 parse->pathname);
 				exit(1);
 			}
-			buffer_size += rc;
-			bytes_read += rc;
-			continue;
-		}
 
-		/* Increment line number. */
-		++line_number;
-
-		/* The manifest checksum should be the last thing in the file. */
-		if (saw_manifest_checksum_line)
-		{
-			pg_log_fatal("unexpected data follows manifest checksum");
-			exit(1);
-		}
+			/* Initialize some fields. */
+			tabent->matched = false;
+			tabent->bad = false;
 
-		/* Find first field on line, and remaining line contents. */
-		first_field_length =
-			findchar(buffer, buffer_size, '\t', buffer_position);
-		rest = buffer + buffer_position + first_field_length + 1;
-		restlen = line_length - (first_field_length + 1);
+			/* Parse size. */
+			tabent->size = strtoll(parse->size, &ep, 10);
+			if (ep == parse->size || *ep != '\0')	/* empty or trailing junk */
+				json_manifest_parse_failure("file size is not an integer");
 
-		/*
-		 * Check the first word of the line to see what kind of line it is.
-		 */
-		if (first_field_length == KWL_MANIFEST_VERSION &&
-			memcmp(buffer + buffer_position, KW_MANIFEST_VERSION,
-				   KWL_MANIFEST_VERSION) == 0)
-		{
-			if (line_number != 1)
+			/* Parse the checksum algorithm, if it's present. */
+			if (parse->algorithm == NULL)
+				tabent->checksum_type = CHECKSUM_TYPE_NONE;
+			else if (!pg_checksum_parse_type(parse->algorithm,
+											 &tabent->checksum_type))
 			{
-				pg_log_fatal("manifest file version should only be specified at line 1");
+				pg_log_fatal("unrecognized checksum algorithm: \"%s\"",
+							 parse->algorithm);
 				exit(1);
 			}
+
+			/* Parse the checksum payload, if it's present. */
+			checksum_string_length = parse->checksum == NULL ? 0
+				: strlen(parse->checksum);
+			if (checksum_string_length == 0)
+			{
+				tabent->checksum_length = 0;
+				tabent->checksum_payload = NULL;
+			}
 			else
 			{
-				char	   *line = buffer + buffer_position;
-				char	   *version;
 
-				version = extractstr(line + first_field_length + 1,
-									 line_length - (first_field_length + 1));
-				if (strcmp(version, "1") != 0)
+				tabent->checksum_length = checksum_string_length / 2;
+				tabent->checksum_payload = palloc(tabent->checksum_length);
+				if (checksum_string_length % 2 != 0 ||
+					!hexdecode_string(tabent->checksum_payload,
+									  parse->checksum,
+									  tabent->checksum_length))
 				{
-					pg_log_fatal("unrecognized manifest version: \"%s\"",
-								 version);
+					pg_log_fatal("invalid checksum for file \"%s\": \"%s\"",
+								 parse->pathname, parse->checksum);
 					exit(1);
 				}
 			}
-		}
-		else if (first_field_length == KWL_MANIFEST_FILE &&
-				 memcmp(buffer + buffer_position, KW_MANIFEST_FILE,
-						KWL_MANIFEST_FILE) == 0)
-		{
-			manifestfile f;
-			manifestfile *tabent;
-			bool		found;
 
-			/* Parse this line. */
-			parse_file_line_from_manifest(&f, rest, restlen);
-
-			/* Make a new entry in the hash table for it. */
-			tabent = manifestfiles_insert(ht, f.pathname, &found);
-			if (found)
+			/* Free memory we no longer need. */
+			if (parse->size != NULL)
 			{
-				pg_log_fatal("duplicate pathname in backup manifest: \"%s\"",
-							 f.pathname);
-				exit(1);
+				pfree(parse->size);
+				parse->size = NULL;
 			}
-
-			/* Copy in all the relevant details. */
-			tabent->size = f.size;
-			tabent->checksum_type = f.checksum_type;
-			tabent->checksum_length = f.checksum_length;
-			tabent->checksum_payload = f.checksum_payload;
-			tabent->matched = false;
-			tabent->bad = false;
-		}
-		else if (first_field_length == KWL_MANIFEST_CHECKSUM &&
-				 memcmp(buffer + buffer_position, KW_MANIFEST_CHECKSUM,
-						KWL_MANIFEST_CHECKSUM) == 0)
-		{
-			saw_manifest_checksum_line = true;
-			if (restlen != PG_SHA256_DIGEST_STRING_LENGTH - 1)
+			if (parse->algorithm != NULL)
 			{
-				pg_log_fatal("manifest file checksum has unexpected length: %d",
-							 restlen);
-				exit(1);
+				pfree(parse->algorithm);
+				parse->algorithm = NULL;
 			}
-			if (!hexdecode_string(manifest_checksum_expected, rest,
-								  PG_SHA256_DIGEST_LENGTH))
+			if (parse->checksum != NULL)
 			{
-				pg_log_fatal("invalid manifest checksum: \"%s\"",
-							 extractstr(rest, restlen));
-				exit(1);
+				pfree(parse->checksum);
+				parse->checksum = NULL;
 			}
-		}
-		else if (first_field_length == -1)
-		{
-			pg_log_fatal("manifest file keyword not terminated by tab");
-			exit(1);
-		}
-		else
-		{
-			char	   *kw;
-
-			kw = extractstr(buffer + buffer_position, first_field_length);
-			pg_log_fatal("unrecognized manifest file keyword: \"%s\"", kw);
-			exit(1);
-		}
-
-		/* Update manifest checksum, if needed. */
-		if (!saw_manifest_checksum_line)
-			pg_sha256_update(&manifest_ctx, (uint8 *) buffer + buffer_position,
-							 line_length + 1);
 
-		/* Advance buffer position over the data we just read. */
-		buffer_position += line_length + 1;
-
-		/* Also mark these bytes as consumed so we know when to stop. */
-		bytes_consumed += line_length + 1;
-
-		/*
-		 * We don't want to incur the expensive of using memmove() to discard
-		 * data after every line, because the lines are short compared to the
-		 * chunk size -- but we must do it at least now and then, or we'll
-		 * have to keep growing the buffer.
-		 */
-		if (buffer_position >= READ_CHUNK_SIZE)
-		{
-			int			leftover_bytes = buffer_size - buffer_position;
-
-			if (leftover_bytes > 0)
-				memmove(buffer, buffer + buffer_position, leftover_bytes);
-			buffer_size -= buffer_position;
-			buffer_position = 0;
-		}
+			/* Expect next file (or end of list). */
+			parse->state = JM_EXPECT_FILES_ARRAY_NEXT;
+			break;
+		default:
+			json_manifest_parse_failure("unexpected object end");
+			break;
 	}
+}
+
+static void
+json_manifest_array_start(void *state)
+{
+	JsonManifestParseState *parse = state;
 
-	/* Checksum verification. */
-	if (!saw_manifest_checksum_line)
-		pg_log_fatal("manifest has no checksum");
-	pg_sha256_final(&manifest_ctx, manifest_checksum_actual);
-	if (memcmp(manifest_checksum_actual, manifest_checksum_expected,
-			   PG_SHA256_DIGEST_LENGTH) != 0)
+	switch (parse->state)
 	{
-		pg_log_fatal("manifest checksum does not match");
-		exit(1);
+		case JM_EXPECT_FILES_ARRAY_START:
+			parse->state = JM_EXPECT_FILES_ARRAY_NEXT;
+			break;
+		default:
+			json_manifest_parse_failure("unexpected array start");
+			break;
 	}
-
-	/* OK, we're done with the manifest file. */
-	close(fd);
-
-	/* Return the hash table we constructed. */
-	return ht;
 }
 
-/*
- * The caller passes the remainder of the line, excluding the initial "File\t"
- * portion.
- */
 static void
-parse_file_line_from_manifest(manifestfile *f, char *rest, int restlen)
+json_manifest_array_end(void *state)
 {
-	char	   *end = rest + restlen;
-	char	   *field[FIELDS_PER_FILE_LINE];
-	unsigned long filesize;
-	char	   *ep;
-	pg_checksum_type checksum_type;
-	int			raw_checksum_length = 0;
-	char	   *raw_checksum_payload = NULL;
-	int			checksum_length;
-	uint8	   *checksum_payload;
-	int			i;
-	char	   *s;
+	JsonManifestParseState *parse = state;
 
-	/* Split the line into fields. */
-	for (i = 0; i < FIELDS_PER_FILE_LINE; ++i)
+	switch (parse->state)
 	{
-		int			toklen;
-
-		toklen = findfield(rest, end, &field[i]);
-		if (rest + toklen >= end && i + 1 < FIELDS_PER_FILE_LINE)
-		{
-			pg_log_fatal("manifest file line has too few fields");
-			exit(1);
-		}
-		rest += toklen + 1;
+		case JM_EXPECT_FILES_ARRAY_NEXT:
+			parse->state = JM_EXPECT_MANIFEST_CHECKSUM_FIELD;
+			break;
+		default:
+			json_manifest_parse_failure("unexpected array end");
+			break;
 	}
+}
 
-	/* We expect to have used the entire line. */
-	if (rest < end)
-	{
-		pg_log_fatal("manifest file line has too many fields");
-		exit(1);
-	}
+static void
+json_manifest_object_field_start(void *state, char *fname, bool isnull)
+{
+	JsonManifestParseState *parse = state;
 
-	/* Parse the size. */
-	filesize = strtoul(field[1], &ep, 10);
-	if (*ep)
+	switch (parse->state)
 	{
-		pg_log_fatal("manifest file size for file \"%s\" is not a number",
-					 field[0]);
-		exit(1);
+		case JM_EXPECT_VERSION_FIELD:
+			if (strcmp(fname, "PostgreSQL-Backup-Manifest-Version") != 0)
+				json_manifest_parse_failure("expected version indicator");
+			parse->state = JM_EXPECT_VERSION_VALUE;
+			break;
+		case JM_EXPECT_FILES_FIELD:
+			if (strcmp(fname, "Files") != 0)
+				json_manifest_parse_failure("expected file list");
+			parse->state = JM_EXPECT_FILES_ARRAY_START;
+			break;
+		case JM_EXPECT_THIS_FILE_FIELD:
+			if (strcmp(fname, "Path") == 0)
+				parse->field = JMFF_PATH;
+			else if (strcmp(fname, "Size") == 0)
+				parse->field = JMFF_SIZE;
+			else if (strcmp(fname, "Last-Modified") == 0)
+				parse->field = JMFF_LAST_MODIFIED;
+			else if (strcmp(fname, "Checksum-Algorithm") == 0)
+				parse->field = JMFF_CHECKSUM_ALGORITHM;
+			else if (strcmp(fname, "Checksum") == 0)
+				parse->field = JMFF_CHECKSUM;
+			else
+				json_manifest_parse_failure("unexpected file field");
+			parse->state = JM_EXPECT_THIS_FILE_VALUE;
+			break;
+		case JM_EXPECT_MANIFEST_CHECKSUM_FIELD:
+			if (strcmp(fname, "Manifest-Checksum") != 0)
+				json_manifest_parse_failure("expected manifest checksum");
+			parse->state = JM_EXPECT_MANIFEST_CHECKSUM_VALUE;
+			break;
+		default:
+			json_manifest_parse_failure("unexpected object field");
+			break;
 	}
+}
 
-	/* Parse the checksum type. */
-	for (s = field[3]; s[0] != '\0' && s[0] != ':'; ++s)
-		;
-	if (*s)
-	{
-		raw_checksum_payload = s + 1;
-		raw_checksum_length = strlen(raw_checksum_payload);
-		*s = '\0';
-	}
-	if (!pg_checksum_parse_type(field[3], &checksum_type))
-	{
-		pg_log_fatal("unrecognized checksum algorithm for file \"%s\": \"%s\"",
-					 field[0], field[3]);
-		exit(1);
-	}
+static void
+json_manifest_scalar(void *state, char *token, JsonTokenType tokentype)
+{
+	JsonManifestParseState *parse = state;
 
-	/* Decode the checksum payload. */
-	checksum_length = raw_checksum_length / 2;
-	if (checksum_length == 0)
-		checksum_payload = NULL;
-	else
+	switch (parse->state)
 	{
-		checksum_payload = palloc(checksum_length);
-		if (!hexdecode_string(checksum_payload, raw_checksum_payload,
-							  checksum_length))
-		{
-			pg_log_fatal("invalid checksum for file \"%s\": \"%s\"",
-						 field[0], raw_checksum_payload);
-			exit(1);
-		}
+		case JM_EXPECT_VERSION_VALUE:
+			if (strcmp(token, "1") != 0)
+				json_manifest_parse_failure("unexpected manifest version");
+			parse->state = JM_EXPECT_FILES_FIELD;
+			break;
+		case JM_EXPECT_THIS_FILE_VALUE:
+			switch (parse->field)
+			{
+				case JMFF_PATH:
+					parse->pathname = token;
+					break;
+				case JMFF_SIZE:
+					parse->size = token;
+					break;
+				case JMFF_LAST_MODIFIED:
+					pfree(token);	/* unused */
+					break;
+				case JMFF_CHECKSUM_ALGORITHM:
+					parse->algorithm = token;
+					break;
+				case JMFF_CHECKSUM:
+					parse->checksum = token;
+					break;
+			}
+			parse->state = JM_EXPECT_THIS_FILE_FIELD;
+			break;
+		case JM_EXPECT_MANIFEST_CHECKSUM_VALUE:
+			pg_log_info("* manifest_checksum = %s", token);
+			parse->state = JM_EXPECT_TOPLEVEL_END;
+			break;
+		default:
+			json_manifest_parse_failure("unexpected scalar");
+			break;
 	}
-
-	/* Fill the output struct. */
-	f->pathname = field[0];
-	f->size = filesize;
-	f->checksum_type = checksum_type;
-	f->checksum_length = checksum_length;
-	f->checksum_payload = checksum_payload;
 }
 
 /*
@@ -917,125 +976,6 @@ should_ignore_relpath(validator_context *context, char *relpath)
 	return false;
 }
 
-/*
- * Extract a NUL-terminated string from a larger buffer.
- */
-static char *
-extractstr(char *buffer, int length)
-{
-	char	   *s = palloc(length + 1);
-
-	memcpy(s, buffer, length);
-	s[length] = '\0';
-
-	return s;
-}
-
-/*
- * Find the next instance of a given character within a buffer that
- * occurs at or after start_position. If there is none, returns -1; else
- * returns the difference between the position at which the character was
- * found and the start position.
- */
-static int
-findchar(char *buffer, int size, char c, int start_position)
-{
-	int			i;
-
-	for (i = start_position; i < size; ++i)
-		if (buffer[i] == c)
-			return i - start_position;
-	return -1;
-}
-
-/*
- * Extract the next field from a line of text read from the manifest file.
- */
-static int
-findfield(char *buffer, char *end, char **result)
-{
-	int			qoffset = 1;
-	int			dqcount = 0;
-	int			toklen;
-	int			bufpos;
-	int			resultpos;
-
-	/*
-	 * If this field is unquoted, we just stop at the next tab; if there's
-	 * none, we stop at the end of the line. Note that if buffer == end, it
-	 * just means that the last field on the line is empty.
-	 */
-	if (buffer == end || *buffer != '"')
-	{
-		toklen = findchar(buffer, end - buffer, '\t', 0);
-
-		if (toklen == -1)
-			toklen = end - buffer;
-		*result = extractstr(buffer, toklen);
-		return toklen;
-	}
-
-	/*
-	 * Our escaping convention is that if the field contains a tab, it must be
-	 * surrounded by double-quotes and any internal double-quotes must be
-	 * doubled.
-	 */
-	while (1)
-	{
-		/* Where's the next double quote? */
-		qoffset += findchar(buffer, end - buffer, '"', qoffset);
-		if (qoffset == -1)
-		{
-			pg_log_fatal("quoted field in backup manifest is not terminated");
-			exit(1);
-		}
-
-		/*
-		 * If the double-quote we found is the last character on the line or
-		 * if it's followed by a tab, we've reached the end of this field.
-		 */
-		if (buffer + qoffset >= end || buffer[qoffset + 1] == '\t')
-			break;
-
-		/* Otherwise, the next character should be another double-quote. */
-		if (buffer[qoffset + 1] != '"')
-		{
-			pg_log_fatal("invalid quoted field in backup manifest");
-			exit(1);
-		}
-
-		/* Skip both double-quotes and go around again. */
-		qoffset += 2;
-		++dqcount;
-	}
-
-	/*
-	 * At this point, we know that qoffset is the offset, relative to buffer,
-	 * of the closing double-quote, and that dqcount is the number of escaped
-	 * double-quotes within the field, and that all of those escape sequences
-	 * are proper. Extract and de-escape the data in the field.
-	 *
-	 * The amount of space needed for the result is equal to the raw token
-	 * length, minus two for the double quotes at the start and end, minus one
-	 * for each doubled double-quote within the token, plus one for the
-	 * trailing zero byte.
-	 */
-	toklen = qoffset + 1;
-	*result = palloc(toklen - dqcount - 1);
-	bufpos = 1;
-	resultpos = 0;
-	while (bufpos < qoffset)
-	{
-		(*result)[resultpos] = buffer[bufpos];
-		bufpos += (buffer[bufpos] == '"' ? 2 : 1);
-		++resultpos;
-	}
-	(*result)[resultpos] = '\0';
-	Assert(resultpos == toklen - dqcount - 2);
-
-	return toklen;
-}
-
 /*
  * Helper function for manifestfiles hash table.
  */
-- 
2.17.2 (Apple Git-113)