diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index 2a99a28..6fd1460 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -9,10 +9,15 @@ #include "postgres_fe.h" +#include "access/visibilitymap.h" #include "pg_upgrade.h" +#include "storage/bufpage.h" +#include "storage/checksum.h" +#include "storage/checksum_impl.h" #include +#define BITS_PER_HEAPBLOCK_OLD 1 #ifndef WIN32 @@ -21,6 +26,7 @@ static int copy_file(const char *fromfile, const char *tofile, bool force); static int win32_pghardlink(const char *src, const char *dst); #endif +static bool checkRewriteVisibilityMap(const char *oldfile, const char *newfile); /* * copyFile() @@ -138,6 +144,235 @@ copy_file(const char *srcfile, const char *dstfile, bool force) #endif +/* + * rewriteVisibilityMap() + * + * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's + * visibility map included one bit per heap page; it now includes two. + * When upgrading a cluster from before that time to a current PostgreSQL + * version, we could refuse to copy visibility maps from the old cluster + * to the new cluster; the next VACUUM would recreate them, but at the + * price of scanning the entire table. So, instead, we rewrite the old + * visibility maps in the new format. That way, the all-visible bit + * remains set for the pages for which it was set previously. The + * all-frozen bit is never set by this conversion; we leave that to + * VACUUM. + */ +const char * +rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force) +{ + int src_fd = 0; + int dst_fd = 0; + char buffer[BLCKSZ]; + ssize_t bytesRead; + int rewriteVmBytesPerPage; + BlockNumber blkno = 0; + + /* Compute we need how many old page bytes to rewrite a new page */ + rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2; + + if ((fromfile == NULL) || (tofile == NULL)) + return "Invalid old file or new file"; + + if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0) + return getErrorText(); + + if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL), S_IRUSR | S_IWUSR)) < 0) + { + close(src_fd); + return getErrorText(); + } + + /* + * Turn each visibility map page into 2 pages one by one. + * Rewritten 2 pages have same page header as old page had. + */ + while ((bytesRead = read(src_fd, buffer, BLCKSZ)) == BLCKSZ) + { + char *old_cur, *old_break, *old_blkend; + PageHeaderData pageheader; + + /* Save the page header data */ + memcpy(&pageheader, buffer, SizeOfPageHeaderData); + + /* + * These old_* variables point to old visibility map page. + * old_cur points to current potision on old page. old_blkend + * points to end of old block. old_break points to old page + * break position for rewritin a new page. After wrote a new + * page, old_break proceeds rewriteVmBytesPerPage bytes. + */ + old_cur = buffer + SizeOfPageHeaderData; + old_blkend = buffer + bytesRead; + old_break = old_cur + rewriteVmBytesPerPage; + + while (old_blkend >= old_break) + { + char vmbuf[BLCKSZ]; + char *new_cur = vmbuf; + + /* Copy page header in advance */ + memcpy(vmbuf, &pageheader, SizeOfPageHeaderData); + + new_cur += SizeOfPageHeaderData; + + /* + * Process old page bytes one by one, and turn it + * into new page. + */ + while (old_break > old_cur) + { + uint16 new_vmbits = 0; + int i; + + /* Generate new format bits while keeping old information */ + for (i = 0; i < BITS_PER_BYTE; i++) + { + if ((((uint8) *old_cur) & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))) + new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i); + } + + /* Copy new visibility map bit to new format page */ + memcpy(new_cur, &new_vmbits, BITS_PER_HEAPBLOCK); + + old_cur += BITS_PER_HEAPBLOCK_OLD; + new_cur += BITS_PER_HEAPBLOCK; + } + + /* Set new checksum for a visibility map page, If enabled */ + if (old_cluster.controldata.data_checksum_version != 0 && + new_cluster.controldata.data_checksum_version != 0) + ((PageHeader) vmbuf)->pd_checksum = pg_checksum_page(vmbuf, blkno); + + if (write(dst_fd, vmbuf, BLCKSZ) != BLCKSZ) + { + close(dst_fd); + close(src_fd); + return getErrorText(); + } + + old_break += rewriteVmBytesPerPage; + blkno++; + } + } + + /* Close files */ + close(dst_fd); + close(src_fd); + + /* + * After rewrote visibility map, we must check the visibility map bits of both files. + */ + if(!checkRewriteVisibilityMap(fromfile, tofile)) + return "failed to rewrite visibility map"; + + return NULL; + +} + +/* + * checkRewriteVisibilityMap() + * + * To ensure that rewriting visiblity map has been done successfully, + * this function compares the visibility map bits between oldfile and newfile. + */ +bool +checkRewriteVisibilityMap(const char *oldfile, const char *newfile) +{ + int old_fd = 0; + int new_fd = 0; + char old_buffer[BLCKSZ]; + char new_buffer[BLCKSZ]; + bool ret = true; + + if ((old_fd = open(oldfile, O_RDONLY, 0)) < 0) + return false; + + if ((new_fd = open(newfile, O_RDONLY, 0)) < 0) + { + close(old_fd); + return false; + } + + /* + * Since new visibility map format size is greater than old format, + * we read old format page at first. + */ + while ((read(old_fd, old_buffer, BLCKSZ)) == BLCKSZ) + { + int i; + char *old_cur; + + /* Skip page header area */ + old_cur = old_buffer + SizeOfPageHeaderData; + + /* + * New format visibility map size is larger than old format + * as (BITS_PER_HEAPBLOCK / BITS_PER_HEAPBLOCK_OLD) times. + */ + for (i = 0; i < (BITS_PER_HEAPBLOCK / BITS_PER_HEAPBLOCK_OLD); i++) + { + int j; + char *new_cur; + + if ((read(new_fd, new_buffer, BLCKSZ)) != BLCKSZ) + { + ret = false; + goto err; + } + + /* SKip page header area */ + new_cur = new_buffer + SizeOfPageHeaderData; + + while ((new_buffer + BLCKSZ) > new_cur) + { + uint8 old_visiblebits = 0; + uint8 new_visiblebits = 0; + uint16 new_vmbits = *(uint16 *) new_cur; + + /* + * Transform new format bits to old format bits while + * checking all-frozen bit is not set. + */ + for (j = 0; j < (BITS_PER_BYTE * BITS_PER_HEAPBLOCK); j++) + { + /* all-frozen bit (second bit) must not be set on new format */ + if (new_vmbits & (2 << (BITS_PER_HEAPBLOCK * j))) + { + ret = false; + goto err; + } + + if (new_vmbits & (1 << (BITS_PER_HEAPBLOCK * j))) + new_visiblebits |= 1 << (BITS_PER_HEAPBLOCK_OLD * j); + } + + old_visiblebits = (uint8) *old_cur; + + /* + * Compare visible bits between from old page and from new page. + * These all-visible bits (first bit) must be same. + */ + if (old_visiblebits != new_visiblebits) + { + ret = false; + goto err; + } + + old_cur += BITS_PER_HEAPBLOCK_OLD; + new_cur += BITS_PER_HEAPBLOCK; + } + } + } + +err: + close(new_fd); + close(old_fd); + + return ret; + +} + void check_hard_link(void) { diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 6122878..89beb73 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -110,6 +110,10 @@ extern char *output_files[]; #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031 /* + * The format of visibility map is changed with this 9.6 commit, + */ +#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011 +/* * pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85, * ("Improve concurrency of foreign key locking") which also updated catalog * version to this value. pg_upgrade behavior depends on whether old and new @@ -365,6 +369,8 @@ bool pid_lock_file_exists(const char *datadir); const char *copyFile(const char *src, const char *dst, bool force); const char *linkFile(const char *src, const char *dst); +const char *rewriteVisibilityMap(const char *fromfile, const char *tofile, + bool force); void check_hard_link(void); FILE *fopen_priv(const char *path, const char *mode); diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index b20f073..9daef0b 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -16,7 +16,7 @@ static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); -static void transfer_relfile(FileNameMap *map, const char *suffix); +static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit); /* @@ -132,6 +132,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) { int mapnum; bool vm_crashsafe_match = true; + bool vm_must_add_frozenbit = false; /* * Do the old and new cluster disagree on the crash-safetiness of the vm @@ -141,13 +142,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER) vm_crashsafe_match = false; + /* + * Do we need to rewrite visibilitymap? + */ + if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER && + new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) + vm_must_add_frozenbit = true; + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0) { /* transfer primary file */ - transfer_relfile(&maps[mapnum], ""); + transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit); /* fsm/vm files added in PG 8.4 */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804) @@ -155,9 +163,9 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) /* * Copy/link any fsm and vm files, if they exist */ - transfer_relfile(&maps[mapnum], "_fsm"); + transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit); if (vm_crashsafe_match) - transfer_relfile(&maps[mapnum], "_vm"); + transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit); } } } @@ -168,9 +176,11 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) * transfer_relfile() * * Copy or link file from old cluster to new one. + * if vm_must_add_frozenbti is true, each visibility map pages are written while + * adding frozen bit, even link mode. */ static void -transfer_relfile(FileNameMap *map, const char *type_suffix) +transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit) { const char *msg; char old_file[MAXPGPATH]; @@ -232,7 +242,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) { pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file); - if ((msg = copyFile(old_file, new_file, true)) != NULL) + /* Rewrite visibility map if needed */ + if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0)) + msg = rewriteVisibilityMap(old_file, new_file, true); + else + msg = copyFile(old_file, new_file, true); + + if (msg) pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, msg); } @@ -240,7 +256,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix) { pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file); - if ((msg = linkFile(old_file, new_file)) != NULL) + /* Rewrite visibility map if needed */ + if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0)) + msg = rewriteVisibilityMap(old_file, new_file, true); + else + msg = linkFile(old_file, new_file); + + if (msg) pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", map->nspname, map->relname, old_file, new_file, msg); }