From 9d285112eaa14a1226acd48a5f1e8ea0d51c2fc6 Mon Sep 17 00:00:00 2001 From: Suraj Kharage Date: Fri, 20 Dec 2019 16:19:44 +0530 Subject: [PATCH v7 2/3] Implementation of backup validator Patch by Suraj Kharage, inputs from Robert Haas, review from Jeevan Chalke, and Robert Haas. --- doc/src/sgml/ref/pg_basebackup.sgml | 12 + src/bin/pg_basebackup/pg_basebackup.c | 516 ++++++++++++++++++++++++++++++++++ src/tools/pgindent/typedefs.list | 2 + 3 files changed, 530 insertions(+) diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index af7c731..cb11f74 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -548,6 +548,18 @@ PostgreSQL documentation + + + + + Validate the given backup directory and detect the modification if any + without restarting the server. For plain backup, provide the backup + directory path with option. Tar format + backups can be verified after untarring. + + + + diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index ca9ccb9..e90e48e 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -27,9 +27,12 @@ #endif #include "access/xlog_internal.h" +#include "common/checksum_utils.h" +#include "common/encode.h" #include "common/file_perm.h" #include "common/file_utils.h" #include "common/logging.h" +#include "common/sha2.h" #include "common/string.h" #include "fe_utils/recovery_gen.h" #include "fe_utils/string_utils.h" @@ -97,6 +100,30 @@ typedef struct WriteManifestState typedef void (*WriteDataCallback) (size_t nbytes, char *buf, void *callback_data); +typedef struct DataDirectoryFileInfo +{ + char *filename; + int filesize; + char *checksum; + bool matched; + uint32 status; /* hash status */ +} DataDirectoryFileInfo; + +/* This is declared in advance because it is required for manifesthash */ +static uint32 string_hash_sdbm(const char *key); + +#define SH_PREFIX manifesthash +#define SH_ELEMENT_TYPE DataDirectoryFileInfo +#define SH_KEY_TYPE char* +#define SH_KEY filename +#define SH_HASH_KEY(tb, key) string_hash_sdbm(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_RAW_ALLOCATOR pg_malloc +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + /* * pg_xlog has been renamed to pg_wal in version 10. This version number * should be compared with PQserverVersion(). @@ -201,6 +228,19 @@ static bool reached_end_position(XLogRecPtr segendpos, uint32 timeline, static const char *get_tablespace_mapping(const char *dir); static void tablespace_list_append(const char *arg); +static void VerifyBackup(void); +static manifesthash_hash *CreateManifestHash(char *manifest_path, + ChecksumAlgorithm *checksum_type); +static void VerifyDir(char *basepath, char *subdirpath, + manifesthash_hash *hashtab, + ChecksumAlgorithm checksum_type); +static void VerifyFile(char *filepath, struct stat st, + char *basepath, + manifesthash_hash *hashtab, + ChecksumAlgorithm checksum_type); +static char *NextLine(char *buf, char *endptr); +static char *NextWord(char *line, char ch, char *endlineptr); +static char *ReadFileIntoBuffer(char *filename, int *length); static void cleanup_directories_atexit(void) @@ -401,6 +441,7 @@ usage(void) " do not verify checksums\n")); printf(_(" --manifest-checksums=SHA256|CRC32C|NONE\n" " calculate checksums for manifest files using provided algorithm\n")); + printf(_(" --verify-backup validate the backup\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\nConnection options:\n")); printf(_(" -d, --dbname=CONNSTR connection string\n")); @@ -2167,11 +2208,13 @@ main(int argc, char **argv) {"no-slot", no_argument, NULL, 2}, {"no-verify-checksums", no_argument, NULL, 3}, {"manifest-checksums", required_argument, NULL, 'm'}, + {"verify-backup", no_argument, NULL, 4}, {NULL, 0, NULL, 0} }; int c; int option_index; + bool verify_backup = false; pg_logging_init(argv[0]); progname = get_progname(argv[0]); @@ -2338,6 +2381,9 @@ main(int argc, char **argv) case 'm': manifest_checksums = pg_strdup(optarg); break; + case 4: + verify_backup = true; + break; default: /* @@ -2460,6 +2506,13 @@ main(int argc, char **argv) } #endif + /* --verify-backup option is specified, validate the backup */ + if (verify_backup) + { + VerifyBackup(); + exit(0); + } + /* connection in replication mode to server */ conn = GetConnection(); if (!conn) @@ -2524,3 +2577,466 @@ main(int argc, char **argv) success = true; return 0; } + +static void +VerifyBackup(void) +{ + char manifest_path[MAXPGPATH]; + manifesthash_hash *hashtab; + manifesthash_iterator i; + DataDirectoryFileInfo *entry; + ChecksumAlgorithm checksum_type; + + snprintf(manifest_path, sizeof(manifest_path), "%s/%s", basedir, + "backup_manifest"); + + /* create hash table */ + hashtab = CreateManifestHash(manifest_path, &checksum_type); + + VerifyDir(basedir, "", hashtab, checksum_type); + + manifesthash_start_iterate(hashtab, &i); + while ((entry = manifesthash_iterate(hashtab, &i)) != NULL) + { + if (!entry->matched) + pg_log_info("file \"%s\" is present in manifest but missing from the backup", + entry->filename); + } +} + +/* + * Given a file path, read that file into buffer, parse that buffer line by + * line and generate the hash table for each line. Also, generate the SHA256 + * checksum for the records that are read from buffer and compare that with + * manifest checksum written in backup_manifest file. If both checksums are + * identical then proceed, otherwise throw an error and abort. Set the + * checksum type of manifest record to out parameter checksum_type. + */ +static manifesthash_hash * +CreateManifestHash(char *manifest_path, ChecksumAlgorithm *checksum_type) +{ + manifesthash_hash *hashtab; + DataDirectoryFileInfo *entry; + char *buf; + ChecksumCtx cCtx; + int length; + char *endptr; + int numlines = 0; + + buf = ReadFileIntoBuffer(manifest_path, &length); + endptr = buf + length; + + hashtab = manifesthash_create(1024, NULL); + + /* Read the first line of buffer */ + if (buf < endptr) + { + char *headerline; + int headerlength; + + /* + * Read the header from file, here header_line is pointing to start of + * file. Advanced the buffer to next line and then buf - header_line + * will give us the header length. + */ + headerline = buf; + buf = NextLine(buf, endptr); + headerlength = buf - headerline; + + numlines++; + + /* + * Initialize the checksum for the first time. Since checksum for the + * manifest file is always generated with SHA256 so initializing with + * SHA256. + */ + initialize_checksum(&cCtx, MC_SHA256); + + /* feed the header to the checksum machinery */ + update_checksum(&cCtx, MC_SHA256, headerline, headerlength); + } + + /* + * Once we read the header, then read the buffer line by line and check + * whether it is a File record or Manifest-Checksum entry and parse + * accordingly. + */ + while (buf < endptr) + { + int length; + char *line; + char *word; + int wordlength; + char *endlineptr; + char *nextword; + + line = buf; + /* read the next line and calculate the length for the line */ + buf = NextLine(buf, endptr); + length = buf - line; + + /* calculate the end of line */ + endlineptr = line + length; + + numlines++; + + /* parse the first word of line */ + word = line; + nextword = NextWord(line, '\t', endlineptr); + wordlength = nextword - word - 1; + + /* + * If it is a File record, then parse it into fields. With this we + * will get the filename, checksum and size. + */ + if (strncmp(word, "File", wordlength) == 0) + { + char *size; + long filesize; + bool found; + long filelength, + sizelength; + char *filename; + char *checksumlabel; + char *checksum; + ChecksumAlgorithm currentchecksumtype; + + /* + * feed line to checksum machinery as it is a FILE type manifest + * record + */ + update_checksum(&cCtx, MC_SHA256, line, length); + + line[length - 1] = '\0'; + + /* parse the filename */ + filename = nextword; + size = NextWord(filename, '\t', endlineptr); + filelength = size - filename; + filename[filelength - 1] = '\0'; + + /* parse the filesize and convert it to long */ + word = NextWord(size, '\t', endlineptr); + sizelength = word - size; + size[sizelength - 1] = '\0'; + filesize = strtol(size, NULL, 10); + + /* skip mtime field */ + checksum = NextWord(word, '\t', endlineptr); + + /* + * parse the checksum field. If it is a "-" that means no + * checksum. Otherwise split this field by ":" character to + * identify the checksum type. + */ + if (strcmp(checksum, "-") == 0) + currentchecksumtype = MC_NONE; + else + { + checksumlabel = checksum; + checksum = NextWord(checksum, ':', endlineptr); + length = checksum - checksumlabel; + checksumlabel[length - 1] = '\0'; + + if (!parse_checksum_algorithm(checksumlabel, ¤tchecksumtype)) + { + pg_log_error("manifest file \"%s\" specifies unknown checksum algorithm \"%s\" at line %d", + manifest_path, checksumlabel, numlines); + exit(1); + } + } + + /* + * All the manifest records should have same checksum type. Error + * out if we find checksum mismatch between manifest records. + */ + if (numlines > 2 && *checksum_type != currentchecksumtype) + { + pg_log_error("manifest file \"%s\" specifies different checksum algorithm \"%s\" at line %d", + manifest_path, checksumlabel, numlines); + exit(1); + } + else + *checksum_type = currentchecksumtype; + + /* insert the hash record */ + entry = manifesthash_insert(hashtab, filename, &found); + entry->filesize = filesize; + entry->checksum = checksum; + } + else if (strncmp(word, "Manifest-Checksum", wordlength) == 0) + { + char checksumbuf[256]; + int checksumbuflen; + char encodedchecksum[256]; + char *checksumlabel; + char *checksum; + + line[length - 1] = '\0'; + + /* parse the checksum label for backup manifest checksum */ + checksumlabel = nextword; + checksum = NextWord(nextword, ':', endlineptr); + length = checksum - checksumlabel; + checksumlabel[length - 1] = '\0'; + + if (strcmp(checksumlabel, "SHA256") != 0) + { + pg_log_error("manifest file \"%s\" specifies unknown manifest checksum algorithm \"%s\" at line %d", + manifest_path, checksumlabel, numlines); + exit(1); + } + + /* finalize the checksum */ + checksumbuflen = finalize_checksum(&cCtx, MC_SHA256, + (char *) checksumbuf); + checksumbuflen = hex_encode(checksumbuf, checksumbuflen, + encodedchecksum); + encodedchecksum[checksumbuflen] = '\0'; + + if (strcmp(encodedchecksum, checksum) != 0) + { + pg_log_error("manifest file \"%s\" has manifest checksum \"%s\" but calculated manifest checksum is \"%s\"", + manifest_path, checksum, encodedchecksum); + exit(1); + } + } + else + { + word[wordlength] = '\0'; + pg_log_error("manifest file \"%s\" contains invalid keyword \"%s\" at line %d", + manifest_path, word, numlines); + exit(1); + } + } + return hashtab; +} + +/* + * Verify all files from the given directory. Scans the given directory + * and for each regular file within that directory, calls VerifyFile() + * for the verification. + */ +static void +VerifyDir(char *basepath, char *pathsuffix, manifesthash_hash *hashtab, + ChecksumAlgorithm checksum_type) +{ + char path[MAXPGPATH]; + DIR *dir; + struct dirent *de; + + snprintf(path, MAXPGPATH, "%s%s", basepath, pathsuffix); + + dir = opendir(path); + if (!dir) + { + pg_log_error("could not open directory \"%s\": %m", path); + exit(1); + } + + while ((de = readdir(dir)) != NULL) + { + char fn[MAXPGPATH]; + char newpathsuffix[MAXPGPATH]; + struct stat st; + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + snprintf(newpathsuffix, MAXPGPATH, "%s/%s", pathsuffix, de->d_name); + + /* + * Ignore the possible presence of a backup_manifest file and/or a + * pg_wal directory in the backup being verified. A backup_manifest + * file generated while backup, does not have entry for + * backup_manifest itself and files in pg_wal directory, so skipping + * those would be right here. + */ + if (strcmp(newpathsuffix, "/pg_wal") == 0 || + strcmp(newpathsuffix, "/backup_manifest") == 0) + continue; + + snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name); + if (stat(fn, &st) < 0) + { + pg_log_error("could not stat file \"%s\": %m", fn); + exit(1); + } + if (S_ISREG(st.st_mode)) + VerifyFile(fn, st, basepath, hashtab, checksum_type); + else if (S_ISDIR(st.st_mode)) + VerifyDir(basepath, newpathsuffix, hashtab, checksum_type); + } + closedir(dir); +} + +/* + * Given the file and its details, check whether it is present in hash table + * and if yes, then compare its details with hash table entry. + */ +static void +VerifyFile(char *filepath, struct stat st, char *basepath, + manifesthash_hash *hashtab, ChecksumAlgorithm checksum_type) +{ + DataDirectoryFileInfo *record; + FILE *fp; + char buf[1048576]; /* 1MB chunk */ + pgoff_t len = 0; + off_t cnt; + char checksumbuf[256]; + char encodedchecksumbuf[256]; + int checksumbuflen; + ChecksumCtx cCtx; + char *relativefilepath; + + /* + * Since backup manifest contains the relative path of file as a filename, + * we need that relative path to lookup into hash table. + */ + relativefilepath = filepath + strlen(basepath) + 1; + + /* + * Lookup into hash table and if record found then we match the file size + * and checksum (if enabled). Modified time cannot be compared with the + * file in the backup directory and its entry in the manifest as manifest + * entry gives mtime from server file whereas the same file in the backup + * will have different mtime. + */ + record = manifesthash_lookup(hashtab, relativefilepath); + if (record == NULL) + { + pg_log_info("file \"%s\" is present in backup but not in manifest", + relativefilepath); + return; + } + + record->matched = true; + if (record->filesize != st.st_size) + pg_log_info("file \"%s\" has size %d in manifest but size %lu in backup", + relativefilepath, record->filesize, st.st_size); + + if (checksum_type == MC_NONE) + return; + + /* + * If checksum_type is other than MC_NONE then generate the checksum based + * on checksum_type. + */ + initialize_checksum(&cCtx, checksum_type); + + fp = fopen(filepath, "r"); + if (!fp) + { + pg_log_error("could not open file \"%s\": %m", filepath); + exit(1); + } + + /* Read file in chunks [1 MB each chunk] */ + while ((cnt = fread(buf, 1, Min(sizeof(buf), st.st_size - len), fp)) > 0) + { + update_checksum(&cCtx, checksum_type, buf, cnt); + len += cnt; + } + + checksumbuflen = finalize_checksum(&cCtx, checksum_type, checksumbuf); + + /* Convert checksum to hexadecimal. */ + checksumbuflen = hex_encode(checksumbuf, checksumbuflen, + encodedchecksumbuf); + encodedchecksumbuf[checksumbuflen] = '\0'; + + fclose(fp); + + /* compare the generated checksum with the checksum present in hash entry */ + if (strcmp(record->checksum, encodedchecksumbuf) != 0) + pg_log_info("file \"%s\" has checksum %s in manifest but checksum %s in backup", + relativefilepath, record->checksum, encodedchecksumbuf); +} + +/* + * Find out the next new line character from the provided string and return + * char pointer pointing to next character after that. + */ +static char * +NextLine(char *buf, char *endptr) +{ + while (*buf != '\n' && buf < endptr) + buf++; + + return ++buf; +} + +/* + * Advance the string until provided character is detected. Return the + * advanced string. + */ +static char * +NextWord(char *line, char ch, char *endlineptr) +{ + while (*line != ch && line < endlineptr) + line++; + + return ++line; +} + +/* Read the given file into buffer and return that buffer */ +static char * +ReadFileIntoBuffer(char *filename, int *length) +{ + int fd; + char *buf; + struct stat stat; + + fd = open(filename, O_RDONLY, 0); + if (fd < 0) + { + pg_log_error("could not open file \"%s\": %m", filename); + exit(1); + } + + if (fstat(fd, &stat)) + { + pg_log_error("could not stat file \"%s\": %m", filename); + close(fd); + exit(1); + } + + buf = pg_malloc(stat.st_size + 1); + + *length = read(fd, buf, stat.st_size); + if (*length != stat.st_size) + { + if (*length < 0) + pg_log_error("could not read file \"%s\": %m", filename); + else + pg_log_error("could not read file \"%s\": read %d of %lu", filename, + *length, (unsigned long) stat.st_size); + + close(fd); + exit(1); + } + + buf[*length] = '\0'; + + close(fd); + + return buf; +} + +/* + * Simple string hash function from http://www.cse.yorku.ca/~oz/hash.html + * + * The backend uses a more sophisticated function for hashing strings, + * but we don't really need that complexity here. + */ +uint32 +string_hash_sdbm(const char *key) +{ + uint32 hash = 0; + int c; + + while ((c = *key++)) + hash = c + (hash << 6) + (hash << 16) - hash; + + return hash; +} diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 87556f6..14e475e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -487,6 +487,7 @@ DR_sqlfunction DR_transientrel DSA DWORD +DataDirectoryFileInfo DataDumperPtr DataPageDeleteStack DateADT @@ -1353,6 +1354,7 @@ MultiXactOffset MultiXactStateData MultiXactStatus MyData +manifesthash_hash manifestinfo NDBOX NODE -- 1.8.3.1