diff --git a/src/bin/pg_verify_checksums/pg_verify_checksums.c b/src/bin/pg_verify_checksums/pg_verify_checksums.c index 28c975446e..3eaa72ab31 100644 --- a/src/bin/pg_verify_checksums/pg_verify_checksums.c +++ b/src/bin/pg_verify_checksums/pg_verify_checksums.c @@ -1,7 +1,7 @@ /* * pg_verify_checksums * - * Verifies page level checksums in an offline cluster + * Verifies page level checksums in a cluster * * Copyright (c) 2010-2018, PostgreSQL Global Development Group * @@ -38,7 +38,7 @@ static const char *progname; static void usage() { - printf(_("%s verifies page level checksums in offline PostgreSQL database cluster.\n\n"), progname); + printf(_("%s verifies page level checksums in PostgreSQL database cluster.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION] [DATADIR]\n"), progname); printf(_("\nOptions:\n")); @@ -57,6 +57,7 @@ static const char *skip[] = { "pg_filenode.map", "pg_internal.init", "PG_VERSION", + "pgsql_tmp", NULL, }; @@ -82,6 +83,7 @@ scan_file(char *fn, int segmentno) PageHeader header = (PageHeader) buf; int f; int blockno; + bool block_retry = false; f = open(fn, 0); if (f < 0) @@ -115,14 +117,56 @@ scan_file(char *fn, int segmentno) csum = pg_checksum_page(buf, blockno + segmentno * RELSEG_SIZE); if (csum != header->pd_checksum) { + /* + * Retry the block on the first failure. It's + * possible that we read the first 4K page of + * the block just before postgres updated the + * entire block so it ends up looking torn to + * us. If there were less than 10 bad blocks so + * far, we wait for 500ms before we retry. + */ + if (block_retry == false) + { + if (badblocks < 10) + pg_usleep(500); + + /* Seek to the beginning of the failed block */ + if (lseek(f, -BLCKSZ, SEEK_CUR) == -1) + { + fprintf(stderr, _("%s: could not lseek in file \"%s\": %m\n"), + progname, fn); + exit(1); + } + + /* Set flag so we know a retry was attempted */ + block_retry = true; + + if (debug) + fprintf(stderr, _("%s: checksum verification failed on first attempt in file \"%s\", block %d: calculated checksum %X but expected %X\n"), + progname, fn, blockno, csum, header->pd_checksum); + + /* Reset loop to validate the block again */ + blockno--; + + continue; + } + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION) fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %d: calculated checksum %X but expected %X\n"), progname, fn, blockno, csum, header->pd_checksum); badblocks++; } else if (debug) - fprintf(stderr, _("%s: checksum verified in file \"%s\", block %d: %X\n"), - progname, fn, blockno, csum); + { + if (block_retry) + fprintf(stderr, _("%s: block %d in file \"%s\" verified ok on recheck\n"), + progname, blockno, fn); + else + fprintf(stderr, _("%s: checksum verified in file \"%s\", block %d: %X\n"), + progname, fn, blockno, csum); + } + + block_retry = false; } close(f); @@ -154,6 +198,15 @@ scan_directory(char *basedir, char *subdir) snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name); if (lstat(fn, &st) < 0) { + if (errno == ENOENT) + { + /* File was removed in the meantime */ + if (debug) + fprintf(stderr, _("%s: ignoring deleted file \"%s\"\n"), + progname, fn); + continue; + } + fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"), progname, fn, strerror(errno)); exit(1); @@ -284,7 +337,7 @@ main(int argc, char *argv[]) exit(1); } - /* Check if cluster is running */ + /* Check if checksums are enabled */ ControlFile = get_controlfile(DataDir, progname, &crc_ok); if (!crc_ok) { @@ -292,13 +345,6 @@ main(int argc, char *argv[]) exit(1); } - if (ControlFile->state != DB_SHUTDOWNED && - ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) - { - fprintf(stderr, _("%s: cluster must be shut down to verify checksums.\n"), progname); - exit(1); - } - if (ControlFile->data_checksum_version == 0) { fprintf(stderr, _("%s: data checksums are not enabled in cluster.\n"), progname);