From 02e9c308cd435e266dda95186127932bc2f50965 Mon Sep 17 00:00:00 2001
From: David Christensen
Date: Wed, 20 Apr 2022 19:59:35 -0500
Subject: [PATCH v6] Teach pg_waldump to extract FPIs from the WAL stream

Extracts full-page images from the WAL stream into a target directory,
which must be empty or not exist.  These images are subject to the same
filtering rules as normal display in pg_waldump, which means that you
can isolate the full page writes to a target relation, among other
things.

Files are saved with the filename:
LSN.RELTABLESPACE.DATOID.RELNODE.BLKNO[_FORK], formatted so that the
names are somewhat sortable; for instance:

00000000-010000C0.1663.1.6117.0
00000000-01000150.1664.0.6115.0
00000000-010001E0.1664.0.6114.0
00000000-01000270.1663.1.6116.0
00000000-01000300.1663.1.6113.0
00000000-01000390.1663.1.6112.0
00000000-01000420.1663.1.8903.0
00000000-010004B0.1663.1.8902.0
00000000-01000540.1663.1.6111.0
00000000-010005D0.1663.1.6110.0

If the FPI comes from a fork other than the main fork, the fork name
will be appended to the output file name; e.g.:

00000000-014A4758.1663.1.12864.0_vm

It's noteworthy that the raw block images do not have the current LSN
stored with them in the WAL stream (as would be true for on-heap
versions of the blocks), nor would the checksum be updated in them
(though WAL itself has checksums, so there is some protection there).
As such there are two versions of this functionality: one which returns
the raw page as it appears in the WAL (--save-fpi) and one which applies
the updated pd_lsn and pd_checksum (--fixup-fpi).

These images could be loaded/inspected via `pg_read_binary_file()` and
used in the `pageinspect` suite of tools to perform detailed analysis on
the pages in question, based on historical information, and may come in
handy for forensics work.
--- doc/src/sgml/ref/pg_waldump.sgml | 79 ++++++++++++ src/bin/pg_waldump/meson.build | 1 + src/bin/pg_waldump/pg_waldump.c | 147 +++++++++++++++++++++- src/bin/pg_waldump/t/002_save_fullpage.pl | 142 +++++++++++++++++++++ 4 files changed, 368 insertions(+), 1 deletion(-) create mode 100644 src/bin/pg_waldump/t/002_save_fullpage.pl diff --git a/doc/src/sgml/ref/pg_waldump.sgml b/doc/src/sgml/ref/pg_waldump.sgml index d559f091e5..ecc973e86b 100644 --- a/doc/src/sgml/ref/pg_waldump.sgml +++ b/doc/src/sgml/ref/pg_waldump.sgml @@ -240,6 +240,85 @@ PostgreSQL documentation + + + + + + + + Save full page images seen in the WAL stream to the + save_path directory, which will be created + if it does not exist. The images saved will be subject to the same + filtering and limiting criteria as display records, but in this + mode pg_waldump will not output any other + information. + + + If invoked using + the -X/--fixup-fpi + option, this page image will include the pd_lsn of + the replayed record rather than the raw page image; as well, + the pd_checksum field will be updated if it had + previously existed. + + + The page images will be saved with the file + format: LSN.RELTABLESPACE.DATOID.RELNODE.BLKNOFORK + + The dot-separated components are (in order): + + + + + + Component + Description + + + + + + LSN + The LSN of the record with this block, formatted + as two 8-character hexadecimal numbers %08X-%08X + + + + RELTABLESPACE + tablespace OID for the block + + + + DATOID + database OID for the block + + + + RELNODE + relnode id for the block + + + + BLKNO + the block number of this block + + + + FORK + + if coming from the main fork, will be empty, otherwise will be + one of _fsm, _vm, + or _init. 
+ + + + + + + + +
diff --git a/src/bin/pg_waldump/meson.build b/src/bin/pg_waldump/meson.build
index 9605976870..34e37bffc3 100644
--- a/src/bin/pg_waldump/meson.build
+++ b/src/bin/pg_waldump/meson.build
@@ -29,6 +29,7 @@ tests += {
   'tap': {
     'tests': [
       't/001_basic.pl',
+      't/002_save_fullpage.pl',
     ],
   },
 }
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index 9993378ca5..e35d15132f 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -24,8 +24,12 @@
 #include "access/xlogstats.h"
 #include "common/fe_memutils.h"
 #include "common/logging.h"
+#include "common/relpath.h"
 #include "getopt_long.h"
 #include "rmgrdesc.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
+#include "storage/checksum_impl.h"
 
 /*
  * NOTE: For any code change or issue fix here, it is highly recommended to
@@ -70,6 +74,10 @@ typedef struct XLogDumpConfig
 	bool		filter_by_relation_block_enabled;
 	ForkNumber	filter_by_relation_forknum;
 	bool		filter_by_fpw;
+
+	/* save options */
+	bool		fixup_fpw;
+	char	   *save_fpw_path;
 } XLogDumpConfig;
 
 
@@ -439,6 +447,108 @@ XLogRecordHasFPW(XLogReaderState *record)
 	return false;
 }
 
+/*
+ * Save every full-page image (FPI) carried by the given WAL record as a
+ * separate file in the directory "savepath".
+ *
+ * Files are named LSN.RELTABLESPACE.DATOID.RELNODE.BLKNO, followed by "_"
+ * and the fork name when the block is not in the main fork.  When "fixup"
+ * is true, pd_lsn and (if already set) pd_checksum of each page are
+ * rewritten to the values replaying the record would leave behind.
+ *
+ * Failure to create, write or flush an output file is fatal.
+ */
+static void
+XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath, bool fixup)
+{
+	int			block_id;
+
+	for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+	{
+		PGAlignedBlock buf;		/* aligned, as pg_checksum_page() requires */
+		char	   *page = buf.data;
+		char		filename[MAXPGPATH];
+		char		forkname[FORKNAMECHARS + 2];	/* _ + \0 */
+		FILE	   *file;
+		BlockNumber blk;
+		RelFileLocator rnode;
+		ForkNumber	fork;
+
+		if (!XLogRecHasBlockRef(record, block_id))
+			continue;
+
+		if (!XLogRecHasBlockImage(record, block_id))
+			continue;
+
+		if (!RestoreBlockImage(record, block_id, page))
+			continue;
+
+		/* we have our extracted FPI, let's save it now */
+
+		XLogRecGetBlockTagExtended(record, block_id,
+								   &rnode, &fork, &blk, NULL);
+
+		/*
+		 * The raw page as stored in the WAL record carries the LSN the
+		 * block had when it was originally written.  That differs from
+		 * the result of replaying this same FPI in recovery, because
+		 * recovery calls RestoreBlockImage() and then sets the LSN as
+		 * part of one action.  So a page as recovered from WAL and the
+		 * page saved here differ in the LSN and in the checksum (if
+		 * enabled).
+		 *
+		 * There are potential use cases for both versions (with and
+		 * without the fixups), so let the user choose; a restored page
+		 * that is empty is always left alone.
+		 */
+		if (fixup && !PageIsNew(page))
+		{
+			PageSetLSN(page, record->ReadRecPtr);
+
+			/*
+			 * A non-zero checksum field is taken to mean that checksums
+			 * are enabled, so recalculate the checksum with the new LSN.
+			 * This is a heuristic, but it lets saved pages work with the
+			 * checksum-related functions in pageinspect without having
+			 * to worry about zero_damaged_pages or other considerations.
+			 *
+			 * FPIs in WAL do not have the checksum field updated in the
+			 * page image; in a checksums-enabled cluster that is done by
+			 * FlushBuffer() when a dirty buffer is written out.  We run
+			 * outside the server, so we have to do it ourselves.
+			 */
+			if (((PageHeader) page)->pd_checksum)
+				((PageHeader) page)->pd_checksum = pg_checksum_page(page, blk);
+		}
+
+		if (fork < 0 || fork > MAX_FORKNUM)
+			pg_fatal("found invalid fork number: %u", fork);
+
+		if (fork == MAIN_FORKNUM)
+			forkname[0] = '\0';
+		else
+			snprintf(forkname, sizeof(forkname), "_%s", forkNames[fork]);
+
+		snprintf(filename, MAXPGPATH, "%s/%08X-%08X.%u.%u.%u.%u%s", savepath,
+				 LSN_FORMAT_ARGS(record->ReadRecPtr),
+				 rnode.spcOid, rnode.dbOid, rnode.relNumber, blk, forkname);
+
+		file = fopen(filename, PG_BINARY_W);
+		if (!file)
+			pg_fatal("could not open file \"%s\": %m", filename);
+
+		if (fwrite(page, BLCKSZ, 1, file) != 1)
+			pg_fatal("could not write file \"%s\": %m", filename);
+
+		/* push the page to stable storage before moving on */
+		if (fflush(file) != 0 || fsync(fileno(file)) != 0)
+			pg_fatal("could not fsync file \"%s\": %m", filename);
+
+		if (fclose(file) != 0)
+			pg_fatal("could not close file \"%s\": %m", filename);
+	}
+}
+
 /*
  * Print a record to stdout
  */
@@ -679,6 +789,9 @@ usage(void)
 			 "                         (default: 1 or the value used in STARTSEG)\n"));
 	printf(_("  -V, --version          output version information, then exit\n"));
 	printf(_("  -w, --fullpage         only show records with a full page write\n"));
+	printf(_("  -W, --save-fpi=path    save full page images to given path as\n"
+			 "                         LSN.T.D.R.B_F\n"));
+	printf(_("  -X, --fixup-fpi=path   like --save-fpi but apply LSN fixups to saved page\n"));
 	printf(_("  -x, --xid=XID          only show records with transaction ID XID\n"));
 	printf(_("  -z, --stats[=record]   show statistics instead of records\n"
 			 "                         (optionally, show per-record statistics)\n"));
@@ -712,6 +825,8 @@ main(int argc, char **argv)
 		{"limit", required_argument, NULL, 'n'},
 		{"path", required_argument, NULL, 'p'},
 		{"quiet", no_argument, NULL, 'q'},
+		{"save-fpi", required_argument, NULL, 'W'},
+		{"fixup-fpi", required_argument, NULL, 'X'},
 		{"relation", required_argument, NULL, 'R'},
 		{"rmgr", required_argument, NULL, 'r'},
 		{"start", required_argument, NULL, 's'},
@@ -772,6 +887,8 @@ main(int argc, char **argv)
 	config.filter_by_fpw = false;
 	config.stats = false;
 	config.stats_per_record = false;
+	config.fixup_fpw = false;
+	config.save_fpw_path = NULL;
 
 	stats.startptr = InvalidXLogRecPtr;
 	stats.endptr = InvalidXLogRecPtr;
@@ -782,7 +899,7 @@ main(int argc, char **argv)
 		goto bad_argument;
 	}
 
-	while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z",
+	while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wW:x:X:z",
 								 long_options, &optindex)) != -1)
 	{
 		switch (option)
@@ -918,6 +1035,11 @@ main(int argc, char **argv)
 			case 'w':
 				config.filter_by_fpw = true;
 				break;
+			case 'W':
+			case 'X':
+				config.fixup_fpw = (option == 'X');
+				config.save_fpw_path = pg_strdup(optarg);
+				break;
 			case 'x':
 				if (sscanf(optarg, "%u", &config.filter_by_xid) != 1)
 				{
@@ -972,6 +1094,38 @@ main(int argc, char **argv)
 		}
 	}
 
+	if (config.save_fpw_path != NULL)
+	{
+		int			dir_status = pg_check_dir(config.save_fpw_path);
+
+		if (dir_status < 0)
+		{
+			pg_log_error("could not access output directory \"%s\": %m",
+						 config.save_fpw_path);
+			goto bad_argument;
+		}
+
+		/*
+		 * pg_check_dir() reports anything above 1 for a directory that
+		 * already has entries; refuse it so images from different runs
+		 * are never mixed together.
+		 */
+		if (dir_status > 1)
+		{
+			pg_log_error("output directory \"%s\" exists but is not empty",
+						 config.save_fpw_path);
+			goto bad_argument;
+		}
+
+		/* Create the dir if it doesn't exist */
+		if (dir_status == 0 && mkdir(config.save_fpw_path, 0700) < 0)
+		{
+			pg_log_error("could not create output directory \"%s\": %m",
+						 config.save_fpw_path);
+			goto bad_argument;
+		}
+	}
+
 	/* parse files as start/end boundaries, extract path if not specified */
 	if (optind < argc)
 	{
@@ -1150,6 +1304,11 @@ main(int argc, char **argv)
 			XLogRecStoreStats(&stats, xlogreader_state);
 			stats.endptr = xlogreader_state->EndRecPtr;
 		}
+		else if (config.save_fpw_path)
+		{
+			if (XLogRecordHasFPW(xlogreader_state))
+				XLogRecordSaveFPWs(xlogreader_state, config.save_fpw_path, config.fixup_fpw);
+		}
 		else
 			XLogDumpDisplayRecord(&config, xlogreader_state);
 	}
diff --git a/src/bin/pg_waldump/t/002_save_fullpage.pl b/src/bin/pg_waldump/t/002_save_fullpage.pl
new file mode 100644
index 0000000000..c98100bf21
--- /dev/null
+++ b/src/bin/pg_waldump/t/002_save_fullpage.pl
@@ -0,0 +1,158 @@
+
+# Copyright (c) 2022, PostgreSQL Global Development Group
+
+# Tests for pg_waldump --save-fpi and --fixup-fpi: extract full-page images
+# for one relation from a WAL segment and check the file names, page LSNs
+# and checksums of what was written.
+
+use strict;
+use warnings;
+use File::Basename;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::RecursiveCopy;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $blocksize = 8192;
+
+# Read the page header of the block image stored in file $path and return
+# (lsn_hi, lsn_lo, checksum): the two halves of pd_lsn formatted as
+# 8-digit uppercase hex strings, and pd_checksum as a number.  Dies if the
+# file cannot be opened or is shorter than $blocksize bytes.
+sub get_block_info
+{
+	my $path = shift;
+	my $block;
+
+	# Page images are binary data; ':raw' keeps any I/O layer from
+	# translating bytes on read.
+	open my $fh, '<:raw', $path or die "couldn't open file: $path: $!\n";
+	my $nread = read $fh, $block, $blocksize;
+	die "couldn't read full block from $path\n"
+	  unless defined $nread && $nread == $blocksize;
+	close $fh;
+
+	# pd_lsn is stored as two 32-bit integers and pd_checksum as a 16-bit
+	# integer, all in the byte order of the machine that wrote the WAL, so
+	# unpack with native-endian formats rather than little-endian ones.
+	my ($lsn_hi, $lsn_lo, $checksum) = unpack('LLS', $block);
+
+	$lsn_hi = sprintf('%08X', $lsn_hi);
+	$lsn_lo = sprintf('%08X', $lsn_lo);
+
+	return ($lsn_hi, $lsn_lo, $checksum);
+}
+
+# Set umask so test directories and files are created with default permissions
+umask(0077);
+
+my $primary = PostgreSQL::Test::Cluster->new('primary');
+$primary->init('-k');
+$primary->append_conf('postgresql.conf', "max_wal_size='100MB'");
+$primary->append_conf('postgresql.conf', "wal_level='replica'");
+$primary->append_conf('postgresql.conf', 'archive_mode=on');
+$primary->append_conf('postgresql.conf', "archive_command='/bin/false'");
+$primary->start;
+
+# Sanity checks for command line options.
+$primary->command_fails(
+	[ 'pg_waldump', '--save-fpi' ],
+	'--save-fpi fails without path');
+$primary->command_fails(
+	[ 'pg_waldump', '--fixup-fpi' ],
+	'--fixup-fpi fails without path');
+
+# generate data/wal to examine
+$primary->safe_psql('postgres', q(CREATE DATABASE db1));
+$primary->safe_psql('db1', <safe_psql('postgres', "SELECT current_setting('block_size')");
+
+# get the relation node, etc for the new table
+my $relation = $primary->safe_psql('db1',
+	q{SELECT format('%s/%s/%s', CASE WHEN reltablespace = 0 THEN dattablespace ELSE reltablespace END, pg_database.oid, pg_relation_filenode(pg_class.oid)) FROM pg_class, pg_database WHERE relname = 'test_table' AND datname = current_database()}
+);
+
+diag $relation;
+
+$primary->stop;
+my $waldir = $primary->basedir . '/pgdata/pg_wal';
+my $walfile = [glob("$waldir/*")]->[2]; # we want the 00000002 file
+my $tmp_folder = PostgreSQL::Test::Utils::tempdir;
+diag "using walfile: $walfile";
+
+# # # extract files
+# PostgreSQL::Test::RecursiveCopy::copypath(
+# 	$primary->data_dir . "/pg_wal",
+# 	$primary->tmp_folder);
+
+$primary->command_ok(['pg_waldump', '--save-fpi', "$tmp_folder/raw", '--relation', $relation, $walfile]);
+$primary->command_ok(['pg_waldump', '--fixup-fpi', "$tmp_folder/fixup", '--relation', $relation, $walfile]);
+
+my $file_re =
+  qr/^([0-9A-F]{8})-([0-9A-F]{8})[.][0-9]+[.][0-9]+[.][0-9]+[.][0-9]+(?:_vm|_init|_fsm)?$/;
+
+my %checksums;
+my %files;
+
+# verify filename formats matches w/--save-fpi
+for my $fullpath (glob "$tmp_folder/raw/*")
+{
+	my $file = File::Basename::basename($fullpath);
+
+	like($file, $file_re, "verify filename format for file $file");
+
+	# save filename for later verification
+	$files{$file}++;
+
+	my ($hi_lsn_fn, $lo_lsn_fn) = ($file =~ $file_re);
+	my ($hi_lsn_bk, $lo_lsn_bk, $checksum) = get_block_info($fullpath);
+
+	# since no fixup, verify the lsn in the block comes before the file's lsn
+	ok( $hi_lsn_fn . $lo_lsn_fn gt $hi_lsn_bk . $lo_lsn_bk,
+		'verify block-based LSN precedes file-based one');
+
+	# stash checksum for later comparisons
+	$checksums{$file} = $checksum;
+}
+
+# verify filename formats matches w/--fixup-fpi
+for my $fullpath (glob "$tmp_folder/fixup/*")
+{
+	my $file = File::Basename::basename($fullpath);
+
+	like($file, $file_re, "verify filename format for file $file");
+
+	# save filename for later verification
+	$files{$file}++;
+
+	my ($hi_lsn_fn, $lo_lsn_fn) = ($file =~ $file_re);
+	my ($hi_lsn_bk, $lo_lsn_bk, $checksum) = get_block_info($fullpath);
+
+	# since fixup, verify the lsn in the block equals file lsn
+	ok( $hi_lsn_fn . $lo_lsn_fn eq $hi_lsn_bk . $lo_lsn_bk,
+		'verify file-based LSN is the same as block-based one');
+
+	# verify checksum change; XXX: there could be valid clashes here,
+	# just validate that the page matches the expected checksum instead?
+	ok( $checksum == 0 || $checksums{$file} != $checksum,
+		'check for checksum change or no checksum');
+}
+
+# validate that we ended up with some files output and they were the same
+ok(keys %files > 0, 'verify we processed some files');
+ok((grep { $_ != 2 } values %files) == 0,
+	'ensure raw and fixup had same number of files');
+
+done_testing();
-- 
2.37.1 (Apple Git-137.1)