From 3140bbb4fa49c5d6f14f99b9910dea3e7fb0d970 Mon Sep 17 00:00:00 2001 From: Maxim Orlov Date: Fri, 11 Mar 2022 11:37:29 +0300 Subject: [PATCH v47 8/8] Use 64-bit XIDs - change TransactionId to 64bit - disk tuple format (HeapTupleHeader) is (almost) unchanged: xmin and xmax remains 32bit -- now 32bit xid is named ShortTransactionId - heap page format is changed to contain xid and multixact base value, tuple's xmin and xmax are offsets from. -- xid_base and multi_base are stored as a page special data. PageHeader remains unmodified. - in-memory tuple (HeapTuple) were enriched with precalulated 64bit xmin/xmax. Authors: - Alexander Korotkov - Teodor Sigaev - Nikita Glukhov - Maxim Orlov - Pavel Borisov - Yura Sokolov - Aleksander Alekseev Discussion: https://postgr.es/m/CACG%3DezZe1NQSCnfHOr78AtAZxJZeCvxrts0ygrxYwe%3DpyyjVWA%40mail.gmail.com Discussion: https://postgr.es/m/CAJ7c6TPDOYBYrnCAeyndkBktO0WG2xSdYduTF0nxq%2BvfkmTF5Q%40mail.gmail.com --- contrib/amcheck/verify_heapam.c | 79 +- contrib/amcheck/verify_nbtree.c | 2 +- contrib/hstore/hstore_io.c | 2 + contrib/pageinspect/Makefile | 3 +- contrib/pageinspect/btreefuncs.c | 16 +- contrib/pageinspect/expected/btree.out | 4 +- contrib/pageinspect/expected/hash_1.out | 166 +++ .../pageinspect/expected/oldextversions.out | 10 +- contrib/pageinspect/expected/page.out | 28 +- contrib/pageinspect/heapfuncs.c | 9 +- contrib/pageinspect/meson.build | 1 + .../pageinspect/pageinspect--1.10--1.11.sql | 145 ++ contrib/pageinspect/pageinspect--1.5.sql | 2 + contrib/pageinspect/pageinspect.control | 2 +- contrib/pageinspect/rawpage.c | 35 +- contrib/pageinspect/sql/btree.sql | 3 +- contrib/pg_surgery/heap_surgery.c | 22 +- .../pg_visibility/expected/pg_visibility.out | 17 + contrib/pg_visibility/pg_visibility.c | 7 +- contrib/pg_visibility/sql/pg_visibility.sql | 18 + contrib/pgrowlocks/pgrowlocks.c | 2 +- contrib/pgstattuple/pgstatapprox.c | 2 + contrib/pgstattuple/pgstatindex.c | 2 +- .../postgres_fdw/expected/postgres_fdw.out 
| 55 +- contrib/postgres_fdw/postgres_fdw.c | 9 +- contrib/postgres_fdw/sql/postgres_fdw.sql | 15 +- src/backend/access/common/heaptuple.c | 8 +- src/backend/access/common/reloptions.c | 118 +- src/backend/access/hash/hashvalidate.c | 5 +- src/backend/access/heap/heapam.c | 1256 +++++++++++++++-- src/backend/access/heap/heapam_handler.c | 53 +- src/backend/access/heap/heapam_visibility.c | 173 +-- src/backend/access/heap/heaptoast.c | 3 + src/backend/access/heap/hio.c | 46 +- src/backend/access/heap/pruneheap.c | 92 +- src/backend/access/heap/rewriteheap.c | 101 +- src/backend/access/heap/vacuumlazy.c | 150 +- src/backend/access/nbtree/nbtpage.c | 2 + src/backend/access/nbtree/nbtsplitloc.c | 16 +- src/backend/access/nbtree/nbtxlog.c | 2 + src/backend/access/rmgrdesc/gistdesc.c | 4 +- src/backend/access/rmgrdesc/heapdesc.c | 32 + src/backend/access/rmgrdesc/mxactdesc.c | 9 +- src/backend/access/rmgrdesc/nbtdesc.c | 4 +- src/backend/access/rmgrdesc/xactdesc.c | 6 +- src/backend/access/rmgrdesc/xlogdesc.c | 6 +- src/backend/access/transam/clog.c | 24 +- src/backend/access/transam/commit_ts.c | 19 - src/backend/access/transam/multixact.c | 686 +-------- src/backend/access/transam/slru.c | 13 +- src/backend/access/transam/subtrans.c | 9 +- src/backend/access/transam/transam.c | 18 +- src/backend/access/transam/twophase.c | 15 +- src/backend/access/transam/varsup.c | 161 +-- src/backend/access/transam/xact.c | 36 +- src/backend/access/transam/xlog.c | 10 +- src/backend/access/transam/xloginsert.c | 7 + src/backend/access/transam/xlogreader.c | 34 - src/backend/access/transam/xlogrecovery.c | 2 +- src/backend/bootstrap/bootstrap.c | 26 +- src/backend/catalog/heap.c | 8 +- src/backend/catalog/pg_inherits.c | 2 +- src/backend/commands/async.c | 2 +- src/backend/commands/dbcommands.c | 9 +- src/backend/commands/indexcmds.c | 6 +- src/backend/commands/sequence.c | 25 +- src/backend/commands/vacuum.c | 45 +- src/backend/executor/execExprInterp.c | 1 + 
src/backend/executor/execUtils.c | 1 + src/backend/executor/nodeModifyTable.c | 1 + src/backend/executor/spi.c | 1 + src/backend/nodes/gen_node_support.pl | 6 +- src/backend/nodes/outfuncs.c | 1 - src/backend/optimizer/util/plancat.c | 2 +- src/backend/postmaster/autovacuum.c | 72 +- src/backend/replication/logical/decode.c | 18 +- src/backend/replication/logical/proto.c | 50 +- .../replication/logical/reorderbuffer.c | 17 +- src/backend/replication/logical/snapbuild.c | 9 +- src/backend/replication/logical/worker.c | 2 +- src/backend/replication/pgoutput/pgoutput.c | 3 +- src/backend/replication/walreceiver.c | 28 +- src/backend/replication/walsender.c | 73 +- src/backend/statistics/extended_stats.c | 1 + src/backend/storage/buffer/Makefile | 3 +- src/backend/storage/buffer/bufmgr.c | 134 +- src/backend/storage/buffer/heap_convert.c | 546 +++++++ src/backend/storage/buffer/meson.build | 1 + src/backend/storage/ipc/procarray.c | 183 +-- src/backend/storage/ipc/sinvaladt.c | 4 +- src/backend/storage/ipc/standby.c | 6 +- src/backend/storage/lmgr/lmgr.c | 16 +- src/backend/storage/lmgr/lock.c | 4 +- src/backend/storage/lmgr/predicate.c | 6 +- src/backend/storage/lmgr/proc.c | 11 +- src/backend/storage/page/bufpage.c | 254 +++- src/backend/tcop/postgres.c | 25 +- src/backend/utils/adt/enum.c | 2 +- src/backend/utils/adt/jsonfuncs.c | 1 + src/backend/utils/adt/lockfuncs.c | 9 +- src/backend/utils/adt/pgstatfuncs.c | 1 + src/backend/utils/adt/rowtypes.c | 12 + src/backend/utils/adt/xid.c | 37 +- src/backend/utils/adt/xid8funcs.c | 83 +- src/backend/utils/cache/catcache.c | 1 + src/backend/utils/cache/relcache.c | 3 +- src/backend/utils/fmgr/fmgr.c | 4 +- src/backend/utils/misc/guc.c | 11 + src/backend/utils/misc/guc_tables.c | 180 +-- src/backend/utils/misc/help_config.c | 8 +- src/backend/utils/misc/pg_controldata.c | 2 +- src/backend/utils/misc/postgresql.conf.sample | 4 +- src/backend/utils/sort/tuplesortvariants.c | 14 +- src/backend/utils/time/combocid.c | 18 +- 
src/backend/utils/time/snapmgr.c | 25 +- src/bin/initdb/initdb.c | 60 +- src/bin/initdb/t/001_initdb.pl | 12 +- src/bin/pg_amcheck/t/004_verify_heapam.pl | 210 ++- src/bin/pg_controldata/pg_controldata.c | 2 +- src/bin/pg_dump/pg_dump.c | 27 +- src/bin/pg_dump/pg_dump.h | 8 +- src/bin/pg_resetwal/pg_resetwal.c | 56 +- src/bin/pg_upgrade/Makefile | 1 + src/bin/pg_upgrade/check.c | 149 +- src/bin/pg_upgrade/controldata.c | 17 +- src/bin/pg_upgrade/file.c | 99 +- src/bin/pg_upgrade/meson.build | 1 + src/bin/pg_upgrade/pg_upgrade.c | 145 +- src/bin/pg_upgrade/pg_upgrade.h | 34 +- src/bin/pg_upgrade/relfilenumber.c | 34 +- src/bin/pg_upgrade/segresize.c | 586 ++++++++ src/bin/pg_upgrade/t/002_pg_upgrade.pl | 20 +- src/bin/pg_upgrade/version.c | 104 +- src/bin/pg_verifybackup/t/003_corruption.pl | 2 +- src/bin/pg_waldump/pg_waldump.c | 2 +- src/include/access/clog.h | 2 +- src/include/access/ginblock.h | 11 +- src/include/access/gist.h | 2 +- src/include/access/heapam.h | 20 +- src/include/access/heapam_xlog.h | 38 +- src/include/access/heaptoast.h | 11 +- src/include/access/htup.h | 18 +- src/include/access/htup_details.h | 237 +++- src/include/access/multixact.h | 11 +- src/include/access/nbtree.h | 10 + src/include/access/rewriteheap.h | 4 +- src/include/access/rmgrlist.h | 1 + src/include/access/slru.h | 10 +- src/include/access/tableam.h | 2 +- src/include/access/transam.h | 89 +- src/include/access/tupmacs.h | 3 +- src/include/access/xact.h | 13 +- src/include/access/xloginsert.h | 1 + src/include/access/xlogreader.h | 4 - src/include/access/xlogrecord.h | 5 +- src/include/c.h | 27 +- src/include/catalog/catversion.h | 3 +- src/include/catalog/pg_amproc.dat | 4 +- src/include/catalog/pg_control.h | 6 + src/include/catalog/pg_operator.dat | 8 +- src/include/catalog/pg_proc.dat | 12 +- src/include/catalog/pg_type.dat | 4 +- src/include/catalog/pg_type.h | 5 + src/include/commands/vacuum.h | 30 +- src/include/fmgr.h | 2 + src/include/nodes/pg_list.h | 4 + 
src/include/pg_config.h.in | 3 + src/include/port/pg_lfind.h | 163 ++- src/include/postgres.h | 9 +- src/include/postmaster/autovacuum.h | 4 +- src/include/storage/buf_internals.h | 5 +- src/include/storage/bufmgr.h | 6 + src/include/storage/bufpage.h | 232 ++- src/include/storage/itemid.h | 2 + src/include/storage/lock.h | 14 +- src/include/storage/proc.h | 7 +- src/include/storage/standby.h | 2 +- src/include/utils/combocid.h | 2 +- src/include/utils/rel.h | 12 +- src/include/utils/xid8.h | 4 +- src/pl/plperl/plperl.c | 4 +- src/pl/plpgsql/src/pl_comp.c | 4 +- src/pl/plpgsql/src/pl_exec.c | 2 + src/pl/plpython/plpy_procedure.c | 4 +- src/pl/tcl/pltcl.c | 4 +- src/test/Makefile | 3 +- src/test/meson.build | 1 + src/test/modules/test_lfind/test_lfind.c | 30 +- src/test/perl/PostgreSQL/Test/Cluster.pm | 4 +- src/test/recovery/t/003_recovery_targets.pl | 2 +- src/test/regress/expected/indirect_toast.out | 8 + src/test/regress/expected/insert.out | 16 +- src/test/regress/expected/opr_sanity.out | 6 +- src/test/regress/expected/select_views.out | 70 +- src/test/regress/expected/txid.out | 8 +- src/test/regress/expected/type_sanity.out | 5 +- src/test/regress/expected/xid.out | 14 +- src/test/regress/expected/xid64.out | 92 ++ src/test/regress/parallel_schedule | 2 +- src/test/regress/pg_regress.c | 2 +- src/test/regress/regress.c | 291 ++++ src/test/regress/sql/indirect_toast.sql | 11 + src/test/regress/sql/insert.sql | 17 +- src/test/regress/sql/select_views.sql | 2 +- src/test/regress/sql/type_sanity.sql | 5 +- src/test/regress/sql/xid64.sql | 84 ++ src/test/xid-64/Makefile | 22 + src/test/xid-64/README | 16 + src/test/xid-64/meson.build | 15 + src/test/xid-64/t/001_test_large_xids.pl | 54 + src/test/xid-64/t/002_test_gucs.pl | 79 ++ src/test/xid-64/t/003_test_integrity.pl | 58 + src/test/xid-64/t/004_test_relminmxid.pl | 90 ++ src/test/xid-64/t/005_stream_subxact.pl | 100 ++ src/test/xid-64/t/006_zeropage.pl | 33 + src/tools/msvc/Solution.pm | 1 + 
src/tools/pgindent/typedefs.list | 4 +- 217 files changed, 6981 insertions(+), 2606 deletions(-) create mode 100644 contrib/pageinspect/expected/hash_1.out create mode 100644 contrib/pageinspect/pageinspect--1.10--1.11.sql create mode 100644 src/backend/storage/buffer/heap_convert.c create mode 100644 src/bin/pg_upgrade/segresize.c create mode 100644 src/test/regress/expected/xid64.out create mode 100644 src/test/regress/sql/xid64.sql create mode 100644 src/test/xid-64/Makefile create mode 100644 src/test/xid-64/README create mode 100644 src/test/xid-64/meson.build create mode 100644 src/test/xid-64/t/001_test_large_xids.pl create mode 100644 src/test/xid-64/t/002_test_gucs.pl create mode 100644 src/test/xid-64/t/003_test_integrity.pl create mode 100644 src/test/xid-64/t/004_test_relminmxid.pl create mode 100644 src/test/xid-64/t/005_stream_subxact.pl create mode 100644 src/test/xid-64/t/006_zeropage.pl diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index c9e71e4e50..63e043e6dd 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -17,6 +17,7 @@ #include "access/multixact.h" #include "access/toast_internals.h" #include "access/visibilitymap.h" +#include "catalog/catalog.h" #include "catalog/pg_am.h" #include "funcapi.h" #include "miscadmin.h" @@ -85,7 +86,7 @@ typedef struct HeapCheckContext * from them. 
*/ FullTransactionId next_fxid; /* ShmemVariableCache->nextXid */ - TransactionId next_xid; /* 32-bit version of next_fxid */ + TransactionId next_xid; /* 64-bit version of next_fxid */ TransactionId oldest_xid; /* ShmemVariableCache->oldestXid */ FullTransactionId oldest_fxid; /* 64-bit version of oldest_xid, computed * relative to next_fxid */ @@ -126,6 +127,7 @@ typedef struct HeapCheckContext uint16 lp_len; uint16 lp_off; HeapTupleHeader tuphdr; + HeapTupleData tuple; int natts; /* Values for iterating over attributes within the tuple */ @@ -165,8 +167,6 @@ static bool check_tuple_visibility(HeapCheckContext *ctx); static void report_corruption(HeapCheckContext *ctx, char *msg); static void report_toast_corruption(HeapCheckContext *ctx, ToastedAttribute *ta, char *msg); -static FullTransactionId FullTransactionIdFromXidAndCtx(TransactionId xid, - const HeapCheckContext *ctx); static void update_cached_xid_range(HeapCheckContext *ctx); static void update_cached_mxid_range(HeapCheckContext *ctx); static XidBoundsViolation check_mxid_in_range(MultiXactId mxid, @@ -390,7 +390,7 @@ verify_heapam(PG_FUNCTION_ARGS) update_cached_xid_range(&ctx); update_cached_mxid_range(&ctx); ctx.relfrozenxid = ctx.rel->rd_rel->relfrozenxid; - ctx.relfrozenfxid = FullTransactionIdFromXidAndCtx(ctx.relfrozenxid, &ctx); + ctx.relfrozenfxid = FullTransactionIdFromXid(ctx.relfrozenxid); ctx.relminmxid = ctx.rel->rd_rel->relminmxid; if (TransactionIdIsNormal(ctx.relfrozenxid)) @@ -505,6 +505,12 @@ verify_heapam(PG_FUNCTION_ARGS) ctx.tuphdr = (HeapTupleHeader) PageGetItem(ctx.page, ctx.itemid); ctx.natts = HeapTupleHeaderGetNatts(ctx.tuphdr); + ctx.tuple.t_data = ctx.tuphdr; + ctx.tuple.t_len = ItemIdGetLength(ctx.itemid); + ctx.tuple.t_tableOid = RelationGetRelid(ctx.rel); + HeapTupleCopyBaseFromPage(ctx.buffer, &ctx.tuple, ctx.page, + IsToastRelation(ctx.rel)); + /* Ok, ready to check this next tuple */ check_tuple(&ctx); } @@ -728,12 +734,13 @@ check_tuple_visibility(HeapCheckContext 
*ctx) XidCommitStatus xmin_status; XidCommitStatus xvac_status; XidCommitStatus xmax_status; + HeapTuple tuple = &ctx->tuple; HeapTupleHeader tuphdr = ctx->tuphdr; ctx->tuple_could_be_pruned = true; /* have not yet proven otherwise */ /* If xmin is normal, it should be within valid range */ - xmin = HeapTupleHeaderGetXmin(tuphdr); + xmin = HeapTupleGetXmin(tuple); switch (get_xid_status(xmin, ctx, &xmin_status)) { case XID_INVALID: @@ -743,19 +750,19 @@ check_tuple_visibility(HeapCheckContext *ctx) report_corruption(ctx, psprintf("xmin %llu equals or exceeds next valid transaction ID %llu", (unsigned long long) xmin, - (unsigned long long) U64FromFullTransactionId(ctx->next_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->next_fxid))); return false; case XID_PRECEDES_CLUSTERMIN: report_corruption(ctx, psprintf("xmin %llu precedes oldest valid transaction ID %llu", (unsigned long long) xmin, - (unsigned long long) U64FromFullTransactionId(ctx->oldest_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->oldest_fxid))); return false; case XID_PRECEDES_RELMIN: report_corruption(ctx, psprintf("xmin %llu precedes relation freeze threshold %llu", (unsigned long long) xmin, - (unsigned long long) U64FromFullTransactionId(ctx->relfrozenfxid))); + (unsigned long long) XidFromFullTransactionId(ctx->relfrozenfxid))); return false; } @@ -781,19 +788,19 @@ check_tuple_visibility(HeapCheckContext *ctx) report_corruption(ctx, psprintf("old-style VACUUM FULL transaction ID %llu for moved off tuple equals or exceeds next valid transaction ID %llu", (unsigned long long) xvac, - (unsigned long long) U64FromFullTransactionId(ctx->next_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->next_fxid))); return false; case XID_PRECEDES_RELMIN: report_corruption(ctx, psprintf("old-style VACUUM FULL transaction ID %llu for moved off tuple precedes relation freeze threshold %llu", (unsigned long long) xvac, - (unsigned long long) 
U64FromFullTransactionId(ctx->relfrozenfxid))); + (unsigned long long) XidFromFullTransactionId(ctx->relfrozenfxid))); return false; case XID_PRECEDES_CLUSTERMIN: report_corruption(ctx, psprintf("old-style VACUUM FULL transaction ID %llu for moved off tuple precedes oldest valid transaction ID %llu", (unsigned long long) xvac, - (unsigned long long) U64FromFullTransactionId(ctx->oldest_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->oldest_fxid))); return false; case XID_BOUNDS_OK: break; @@ -847,19 +854,19 @@ check_tuple_visibility(HeapCheckContext *ctx) report_corruption(ctx, psprintf("old-style VACUUM FULL transaction ID %llu for moved in tuple equals or exceeds next valid transaction ID %llu", (unsigned long long) xvac, - (unsigned long long) U64FromFullTransactionId(ctx->next_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->next_fxid))); return false; case XID_PRECEDES_RELMIN: report_corruption(ctx, psprintf("old-style VACUUM FULL transaction ID %llu for moved in tuple precedes relation freeze threshold %llu", (unsigned long long) xvac, - (unsigned long long) U64FromFullTransactionId(ctx->relfrozenfxid))); + (unsigned long long) XidFromFullTransactionId(ctx->relfrozenfxid))); return false; case XID_PRECEDES_CLUSTERMIN: report_corruption(ctx, psprintf("old-style VACUUM FULL transaction ID %llu for moved in tuple precedes oldest valid transaction ID %llu", (unsigned long long) xvac, - (unsigned long long) U64FromFullTransactionId(ctx->oldest_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->oldest_fxid))); return false; case XID_BOUNDS_OK: break; @@ -936,7 +943,7 @@ check_tuple_visibility(HeapCheckContext *ctx) * HEAP_XMAX_IS_LOCKED_ONLY is true, but for now we err on the side of * avoiding possibly-bogus complaints about missing TOAST entries. 
*/ - xmax = HeapTupleHeaderGetRawXmax(tuphdr); + xmax = HeapTupleGetRawXmax(tuple); switch (check_mxid_valid_in_rel(xmax, ctx)) { case XID_INVALID: @@ -995,7 +1002,7 @@ check_tuple_visibility(HeapCheckContext *ctx) * We already checked above that this multixact is within limits for * this table. Now check the update xid from this multixact. */ - xmax = HeapTupleGetUpdateXid(tuphdr); + xmax = HeapTupleGetUpdateXid(tuple); switch (get_xid_status(xmax, ctx, &xmax_status)) { case XID_INVALID: @@ -1007,19 +1014,19 @@ check_tuple_visibility(HeapCheckContext *ctx) report_corruption(ctx, psprintf("update xid %llu equals or exceeds next valid transaction ID %llu", (unsigned long long) xmax, - (unsigned long long) U64FromFullTransactionId(ctx->next_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->next_fxid))); return true; case XID_PRECEDES_RELMIN: report_corruption(ctx, psprintf("update xid %llu precedes relation freeze threshold %llu", (unsigned long long) xmax, - (unsigned long long) U64FromFullTransactionId(ctx->relfrozenfxid))); + (unsigned long long) XidFromFullTransactionId(ctx->relfrozenfxid))); return true; case XID_PRECEDES_CLUSTERMIN: report_corruption(ctx, psprintf("update xid %llu precedes oldest valid transaction ID %llu", (unsigned long long) xmax, - (unsigned long long) U64FromFullTransactionId(ctx->oldest_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->oldest_fxid))); return true; case XID_BOUNDS_OK: break; @@ -1059,26 +1066,26 @@ check_tuple_visibility(HeapCheckContext *ctx) } /* xmax is an XID, not a MXID. Sanity check it. 
*/ - xmax = HeapTupleHeaderGetRawXmax(tuphdr); + xmax = HeapTupleGetRawXmax(tuple); switch (get_xid_status(xmax, ctx, &xmax_status)) { case XID_IN_FUTURE: report_corruption(ctx, psprintf("xmax %llu equals or exceeds next valid transaction ID %llu", (unsigned long long) xmax, - (unsigned long long) U64FromFullTransactionId(ctx->next_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->next_fxid))); return false; /* corrupt */ case XID_PRECEDES_RELMIN: report_corruption(ctx, psprintf("xmax %llu precedes relation freeze threshold %llu", (unsigned long long) xmax, - (unsigned long long) U64FromFullTransactionId(ctx->relfrozenfxid))); + (unsigned long long) XidFromFullTransactionId(ctx->relfrozenfxid))); return false; /* corrupt */ case XID_PRECEDES_CLUSTERMIN: report_corruption(ctx, psprintf("xmax %llu precedes oldest valid transaction ID %llu", (unsigned long long) xmax, - (unsigned long long) U64FromFullTransactionId(ctx->oldest_fxid))); + (unsigned long long) XidFromFullTransactionId(ctx->oldest_fxid))); return false; /* corrupt */ case XID_BOUNDS_OK: case XID_INVALID: @@ -1553,24 +1560,6 @@ check_tuple(HeapCheckContext *ctx) ctx->attnum = -1; } -/* - * Convert a TransactionId into a FullTransactionId using our cached values of - * the valid transaction ID range. It is the caller's responsibility to have - * already updated the cached values, if necessary. - */ -static FullTransactionId -FullTransactionIdFromXidAndCtx(TransactionId xid, const HeapCheckContext *ctx) -{ - uint32 epoch; - - if (!TransactionIdIsNormal(xid)) - return FullTransactionIdFromEpochAndXid(0, xid); - epoch = EpochFromFullTransactionId(ctx->next_fxid); - if (xid > ctx->next_xid) - epoch--; - return FullTransactionIdFromEpochAndXid(epoch, xid); -} - /* * Update our cached range of valid transaction IDs. 
*/ @@ -1584,7 +1573,7 @@ update_cached_xid_range(HeapCheckContext *ctx) LWLockRelease(XidGenLock); /* And compute alternate versions of the same */ - ctx->oldest_fxid = FullTransactionIdFromXidAndCtx(ctx->oldest_xid, ctx); + ctx->oldest_fxid = FullTransactionIdFromXid(ctx->oldest_xid); ctx->next_xid = XidFromFullTransactionId(ctx->next_fxid); } @@ -1684,7 +1673,7 @@ get_xid_status(TransactionId xid, HeapCheckContext *ctx, } /* Check if the xid is within bounds */ - fxid = FullTransactionIdFromXidAndCtx(xid, ctx); + fxid = FullTransactionIdFromXid(xid); if (!fxid_in_cached_range(fxid, ctx)) { /* @@ -1693,7 +1682,6 @@ get_xid_status(TransactionId xid, HeapCheckContext *ctx, * performed the full xid conversion, reconvert. */ update_cached_xid_range(ctx); - fxid = FullTransactionIdFromXidAndCtx(xid, ctx); } if (FullTransactionIdPrecedesOrEquals(ctx->next_fxid, fxid)) @@ -1717,8 +1705,7 @@ get_xid_status(TransactionId xid, HeapCheckContext *ctx, *status = XID_COMMITTED; LWLockAcquire(XactTruncationLock, LW_SHARED); clog_horizon = - FullTransactionIdFromXidAndCtx(ShmemVariableCache->oldestClogXid, - ctx); + FullTransactionIdFromXid(ShmemVariableCache->oldestClogXid); if (FullTransactionIdPrecedesOrEquals(clog_horizon, fxid)) { if (TransactionIdIsCurrentTransactionId(xid)) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 9021d156eb..d2720124d7 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -526,7 +526,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, * avoid this. 
*/ if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && - !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), + !TransactionIdPrecedes(HeapTupleGetXmin(rel->rd_indextuple), snapshot->xmin)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c index 6161df2790..ed4dcbe9a3 100644 --- a/contrib/hstore/hstore_io.c +++ b/contrib/hstore/hstore_io.c @@ -853,6 +853,7 @@ hstore_from_record(PG_FUNCTION_ARGS) ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; tuple.t_data = rec; + HeapTupleSetZeroBase(&tuple); values = (Datum *) palloc(ncolumns * sizeof(Datum)); nulls = (bool *) palloc(ncolumns * sizeof(bool)); @@ -1006,6 +1007,7 @@ hstore_populate_record(PG_FUNCTION_ARGS) ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; tuple.t_data = rec; + HeapTupleSetZeroBase(&tuple); } /* diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile index 5c0736564a..5ca80c9d76 100644 --- a/contrib/pageinspect/Makefile +++ b/contrib/pageinspect/Makefile @@ -13,7 +13,8 @@ OBJS = \ rawpage.o EXTENSION = pageinspect -DATA = pageinspect--1.9--1.10.sql pageinspect--1.8--1.9.sql \ +DATA = pageinspect--1.10--1.11.sql \ + pageinspect--1.9--1.10.sql pageinspect--1.8--1.9.sql \ pageinspect--1.7--1.8.sql pageinspect--1.6--1.7.sql \ pageinspect--1.5.sql pageinspect--1.5--1.6.sql \ pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \ diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index b18aa0af7f..7b3a57dcf5 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -105,6 +105,9 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) stat->page_size = PageGetPageSize(page); + stat->btpo_prev = opaque->btpo_prev; + stat->btpo_level = opaque->btpo_level; + /* page type (flags) */ if (P_ISDELETED(opaque)) { @@ -126,11 +129,18 @@ 
GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) FullTransactionId safexid = BTPageGetDeleteXid(page); elog(DEBUG2, "deleted page from block %u has safexid %llu", - blkno, (unsigned long long) U64FromFullTransactionId(safexid)); + blkno, (unsigned long long) XidFromFullTransactionId(safexid)); } else + { + ShortTransactionId safexid = BTP_GET_XACT(opaque); + + stat->btpo_prev = 0; + stat->btpo_level = 0; + elog(DEBUG2, "deleted page from block %u has safexid %u", - blkno, opaque->btpo_level); + blkno, safexid); + } /* Don't interpret BTDeletedPageData as index tuples */ maxoff = InvalidOffsetNumber; @@ -145,9 +155,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) stat->type = 'i'; /* btpage opaque data */ - stat->btpo_prev = opaque->btpo_prev; stat->btpo_next = opaque->btpo_next; - stat->btpo_level = opaque->btpo_level; stat->btpo_flags = opaque->btpo_flags; stat->btpo_cycleid = opaque->btpo_cycleid; diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index 035a81a759..5fb9122466 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -94,8 +94,8 @@ SELECT bt_page_items('aaa'::bytea); ERROR: invalid page size -- invalid special area size CREATE INDEX test1_a_brin ON test1 USING brin(a); -SELECT bt_page_items(get_raw_page('test1', 0)); -ERROR: input page is not a valid btree page +-- XXX: false positive in 64xids due to equal sizes of BTPageOpaque and HeapPageSpecialData +-- SELECT bt_page_items(get_raw_page('test1', 0)); SELECT bt_page_items(get_raw_page('test1_a_brin', 0)); ERROR: input page is not a valid btree page \set VERBOSITY default diff --git a/contrib/pageinspect/expected/hash_1.out b/contrib/pageinspect/expected/hash_1.out new file mode 100644 index 0000000000..5e64eb9260 --- /dev/null +++ b/contrib/pageinspect/expected/hash_1.out @@ -0,0 +1,166 @@ +CREATE TABLE test_hash (a int, b text); +INSERT INTO test_hash VALUES (1, 
'one'); +CREATE INDEX test_hash_a_idx ON test_hash USING hash (a); +\x +SELECT hash_page_type(get_raw_page('test_hash_a_idx', 0)); +-[ RECORD 1 ]--+--------- +hash_page_type | metapage + +SELECT hash_page_type(get_raw_page('test_hash_a_idx', 1)); +-[ RECORD 1 ]--+------- +hash_page_type | bucket + +SELECT hash_page_type(get_raw_page('test_hash_a_idx', 2)); +-[ RECORD 1 ]--+------- +hash_page_type | bucket + +SELECT hash_page_type(get_raw_page('test_hash_a_idx', 3)); +-[ RECORD 1 ]--+------- +hash_page_type | bucket + +SELECT hash_page_type(get_raw_page('test_hash_a_idx', 4)); +-[ RECORD 1 ]--+------- +hash_page_type | bucket + +SELECT hash_page_type(get_raw_page('test_hash_a_idx', 5)); +-[ RECORD 1 ]--+------- +hash_page_type | bitmap + +SELECT hash_page_type(get_raw_page('test_hash_a_idx', 6)); +ERROR: block number 6 is out of range for relation "test_hash_a_idx" +SELECT * FROM hash_bitmap_info('test_hash_a_idx', -1); +ERROR: invalid block number +SELECT * FROM hash_bitmap_info('test_hash_a_idx', 0); +ERROR: invalid overflow block number 0 +SELECT * FROM hash_bitmap_info('test_hash_a_idx', 1); +ERROR: invalid overflow block number 1 +SELECT * FROM hash_bitmap_info('test_hash_a_idx', 2); +ERROR: invalid overflow block number 2 +SELECT * FROM hash_bitmap_info('test_hash_a_idx', 3); +ERROR: invalid overflow block number 3 +SELECT * FROM hash_bitmap_info('test_hash_a_idx', 4); +ERROR: invalid overflow block number 4 +SELECT * FROM hash_bitmap_info('test_hash_a_idx', 5); +ERROR: invalid overflow block number 5 +SELECT * FROM hash_bitmap_info('test_hash_a_idx', 6); +ERROR: block number 6 is out of range for relation "test_hash_a_idx" +SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, +lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM +hash_metapage_info(get_raw_page('test_hash_a_idx', 0)); +-[ RECORD 1 
]-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------- +magic | 105121344 +version | 4 +ntuples | 1 +bsize | 8156 +bmsize | 4096 +bmshift | 15 +maxbucket | 3 +highmask | 7 +lowmask | 3 +ovflpoint | 2 +firstfree | 0 +nmaps | 1 +procid | 450 +spares | {0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} +mapp | {5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} + +SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, +lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM +hash_metapage_info(get_raw_page('test_hash_a_idx', 1)); +ERROR: page is not a hash meta page +SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, +lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM +hash_metapage_info(get_raw_page('test_hash_a_idx', 2)); +ERROR: page is not a hash meta page +SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, +lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM +hash_metapage_info(get_raw_page('test_hash_a_idx', 3)); +ERROR: page is not a hash meta page +SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, +lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM +hash_metapage_info(get_raw_page('test_hash_a_idx', 4)); +ERROR: page is not a hash meta page +SELECT magic, version, ntuples, bsize, bmsize, bmshift, maxbucket, highmask, +lowmask, ovflpoint, firstfree, nmaps, procid, spares, mapp FROM +hash_metapage_info(get_raw_page('test_hash_a_idx', 5)); +ERROR: page is not a hash meta page +SELECT live_items, dead_items, page_size, hasho_prevblkno, hasho_nextblkno, +hasho_bucket, hasho_flag, hasho_page_id FROM +hash_page_stats(get_raw_page('test_hash_a_idx', 0)); +ERROR: page is not a hash bucket or overflow page +SELECT live_items, dead_items, page_size, hasho_prevblkno, hasho_nextblkno, 
+hasho_bucket, hasho_flag, hasho_page_id FROM +hash_page_stats(get_raw_page('test_hash_a_idx', 1)); +-[ RECORD 1 ]---+----------- +live_items | 0 +dead_items | 0 +page_size | 8192 +hasho_prevblkno | 3 +hasho_nextblkno | 4294967295 +hasho_bucket | 0 +hasho_flag | 2 +hasho_page_id | 65408 + +SELECT live_items, dead_items, page_size, hasho_prevblkno, hasho_nextblkno, +hasho_bucket, hasho_flag, hasho_page_id FROM +hash_page_stats(get_raw_page('test_hash_a_idx', 2)); +-[ RECORD 1 ]---+----------- +live_items | 0 +dead_items | 0 +page_size | 8192 +hasho_prevblkno | 3 +hasho_nextblkno | 4294967295 +hasho_bucket | 1 +hasho_flag | 2 +hasho_page_id | 65408 + +SELECT live_items, dead_items, page_size, hasho_prevblkno, hasho_nextblkno, +hasho_bucket, hasho_flag, hasho_page_id FROM +hash_page_stats(get_raw_page('test_hash_a_idx', 3)); +-[ RECORD 1 ]---+----------- +live_items | 1 +dead_items | 0 +page_size | 8192 +hasho_prevblkno | 3 +hasho_nextblkno | 4294967295 +hasho_bucket | 2 +hasho_flag | 2 +hasho_page_id | 65408 + +SELECT live_items, dead_items, page_size, hasho_prevblkno, hasho_nextblkno, +hasho_bucket, hasho_flag, hasho_page_id FROM +hash_page_stats(get_raw_page('test_hash_a_idx', 4)); +-[ RECORD 1 ]---+----------- +live_items | 0 +dead_items | 0 +page_size | 8192 +hasho_prevblkno | 3 +hasho_nextblkno | 4294967295 +hasho_bucket | 3 +hasho_flag | 2 +hasho_page_id | 65408 + +SELECT live_items, dead_items, page_size, hasho_prevblkno, hasho_nextblkno, +hasho_bucket, hasho_flag, hasho_page_id FROM +hash_page_stats(get_raw_page('test_hash_a_idx', 5)); +ERROR: page is not a hash bucket or overflow page +SELECT * FROM hash_page_items(get_raw_page('test_hash_a_idx', 0)); +ERROR: page is not a hash bucket or overflow page +SELECT * FROM hash_page_items(get_raw_page('test_hash_a_idx', 1)); +(0 rows) + +SELECT * FROM hash_page_items(get_raw_page('test_hash_a_idx', 2)); +(0 rows) + +SELECT * FROM hash_page_items(get_raw_page('test_hash_a_idx', 3)); +-[ RECORD 1 ]---------- 
+itemoffset | 1 +ctid | (0,1) +data | 2389907270 + +SELECT * FROM hash_page_items(get_raw_page('test_hash_a_idx', 4)); +(0 rows) + +SELECT * FROM hash_page_items(get_raw_page('test_hash_a_idx', 5)); +ERROR: page is not a hash bucket or overflow page +DROP TABLE test_hash; diff --git a/contrib/pageinspect/expected/oldextversions.out b/contrib/pageinspect/expected/oldextversions.out index f5c4b61bd7..00323d392d 100644 --- a/contrib/pageinspect/expected/oldextversions.out +++ b/contrib/pageinspect/expected/oldextversions.out @@ -40,16 +40,16 @@ SELECT * FROM bt_page_items('test1_a_idx', 1); -- pagesize in pageinspect >= 1.10. ALTER EXTENSION pageinspect UPDATE TO '1.9'; \df page_header - List of functions - Schema | Name | Result data type | Argument data types | Type ---------+-------------+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------ - public | page_header | record | page bytea, OUT lsn pg_lsn, OUT checksum smallint, OUT flags smallint, OUT lower smallint, OUT upper smallint, OUT special smallint, OUT pagesize smallint, OUT version smallint, OUT prune_xid xid | func + List of functions + Schema | Name | Result data type | Argument data types | Type +--------+-------------+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------ + public | page_header | record | page bytea, OUT lsn pg_lsn, OUT checksum smallint, OUT flags smallint, OUT lower smallint, OUT upper smallint, OUT special smallint, OUT pagesize smallint, OUT version smallint, OUT xid_base xid, OUT multi_base xid, OUT prune_xid xid | func (1 row) SELECT pagesize, version FROM page_header(get_raw_page('test1', 0)); pagesize | 
version ----------+--------- - 8192 | 4 + 8192 | 5 (1 row) DROP TABLE test1; diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3bdc37bbf5..5ca00378df 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -48,7 +48,7 @@ SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0); SELECT pagesize, version FROM page_header(get_raw_page('test1', 0)); pagesize | version ----------+--------- - 8192 | 4 + 8192 | 5 (1 row) SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_test; @@ -69,19 +69,19 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); fsm_page_contents ------------------- - 0: 254 + - 1: 254 + - 3: 254 + - 7: 254 + - 15: 254 + - 31: 254 + - 63: 254 + - 127: 254 + - 255: 254 + - 511: 254 + - 1023: 254 + - 2047: 254 + - 4095: 254 + + 0: 253 + + 1: 253 + + 3: 253 + + 7: 253 + + 15: 253 + + 31: 253 + + 63: 253 + + 127: 253 + + 255: 253 + + 511: 253 + + 1023: 253 + + 2047: 253 + + 4095: 253 + fp_next_slot: 0 + (1 row) diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index aed2753253..c65f7099b7 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -163,7 +163,7 @@ heap_page_items(PG_FUNCTION_ARGS) inter_call_data->tupd = tupdesc; inter_call_data->offset = FirstOffsetNumber; - inter_call_data->page = VARDATA(raw_page); + inter_call_data->page = get_page_from_raw(raw_page); fctx->max_calls = PageGetMaxOffsetNumber(inter_call_data->page); fctx->user_fctx = inter_call_data; @@ -211,6 +211,7 @@ heap_page_items(PG_FUNCTION_ARGS) lp_offset == MAXALIGN(lp_offset) && lp_offset + lp_len <= raw_page_size) { + HeapTupleData tup; HeapTupleHeader tuphdr; bytea *tuple_data_bytea; int tuple_data_len; @@ -218,9 +219,11 @@ heap_page_items(PG_FUNCTION_ARGS) /* Extract information from the tuple header 
*/ tuphdr = (HeapTupleHeader) PageGetItem(page, id); + tup.t_data = tuphdr; + HeapTupleCopyBaseFromPage(InvalidBuffer, &tup, page, false); - values[4] = UInt32GetDatum(HeapTupleHeaderGetRawXmin(tuphdr)); - values[5] = UInt32GetDatum(HeapTupleHeaderGetRawXmax(tuphdr)); + values[4] = TransactionIdGetDatum(HeapTupleGetXmin(&tup)); + values[5] = TransactionIdGetDatum(HeapTupleGetRawXmax(&tup)); /* shared with xvac */ values[6] = UInt32GetDatum(HeapTupleHeaderGetRawCommandId(tuphdr)); values[7] = PointerGetDatum(&tuphdr->t_ctid); diff --git a/contrib/pageinspect/meson.build b/contrib/pageinspect/meson.build index 4af8153e4f..8af29aa7fb 100644 --- a/contrib/pageinspect/meson.build +++ b/contrib/pageinspect/meson.build @@ -27,6 +27,7 @@ install_data( 'pageinspect--1.7--1.8.sql', 'pageinspect--1.8--1.9.sql', 'pageinspect--1.9--1.10.sql', + 'pageinspect--1.10--1.11.sql', 'pageinspect.control', kwargs: contrib_data_args, ) diff --git a/contrib/pageinspect/pageinspect--1.10--1.11.sql b/contrib/pageinspect/pageinspect--1.10--1.11.sql new file mode 100644 index 0000000000..236f18aa2f --- /dev/null +++ b/contrib/pageinspect/pageinspect--1.10--1.11.sql @@ -0,0 +1,145 @@ +/* contrib/pageinspect/pageinspect--1.10--1.11.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.11'" to load this file. 
\quit + +-- +-- gist_page_opaque_info() +-- +DROP FUNCTION gist_page_opaque_info(bytea); +CREATE FUNCTION gist_page_opaque_info(IN page bytea, + OUT lsn pg_lsn, + OUT nsn pg_lsn, + OUT rightlink bigint, + OUT flags text[]) +AS 'MODULE_PATHNAME', 'gist_page_opaque_info' +LANGUAGE C STRICT PARALLEL SAFE; + + +-- +-- gist_page_items_bytea() +-- +DROP FUNCTION gist_page_items_bytea(bytea); +CREATE FUNCTION gist_page_items_bytea(IN page bytea, + OUT itemoffset smallint, + OUT ctid tid, + OUT itemlen smallint, + OUT dead boolean, + OUT key_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'gist_page_items_bytea' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- gist_page_items() +-- +DROP FUNCTION gist_page_items(bytea, regclass); +CREATE FUNCTION gist_page_items(IN page bytea, + IN index_oid regclass, + OUT itemoffset smallint, + OUT ctid tid, + OUT itemlen smallint, + OUT dead boolean, + OUT keys text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'gist_page_items' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- get_raw_page() +-- +DROP FUNCTION get_raw_page(text, int8); +DROP FUNCTION IF EXISTS get_raw_page(text, int4); +CREATE FUNCTION get_raw_page(text, int8) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_1_9' +LANGUAGE C STRICT PARALLEL SAFE; + +DROP FUNCTION get_raw_page(text, text, int8); +DROP FUNCTION IF EXISTS get_raw_page(text, text, int4); +CREATE FUNCTION get_raw_page(text, text, int8) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_fork_1_9' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- page_checksum() +-- +DROP FUNCTION page_checksum(IN page bytea, IN blkno int8); +DROP FUNCTION IF EXISTS page_checksum(IN page bytea, IN blkno int4); +CREATE FUNCTION page_checksum(IN page bytea, IN blkno int8) +RETURNS smallint +AS 'MODULE_PATHNAME', 'page_checksum_1_9' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_metap() +-- +DROP FUNCTION bt_metap(text); +CREATE FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int8, + OUT level 
int8, + OUT fastroot int8, + OUT fastlevel int8, + OUT last_cleanup_num_delpages int8, + OUT last_cleanup_num_tuples float8, + OUT allequalimage boolean) +AS 'MODULE_PATHNAME', 'bt_metap' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_page_stats() +-- +DROP FUNCTION bt_page_stats(text, int8); +DROP FUNCTION IF EXISTS bt_page_stats(text, int4); +CREATE FUNCTION bt_page_stats(IN relname text, IN blkno int8, + OUT blkno int8, + OUT type "char", + OUT live_items int4, + OUT dead_items int4, + OUT avg_item_size int4, + OUT page_size int4, + OUT free_size int4, + OUT btpo_prev int8, + OUT btpo_next int8, + OUT btpo_level int8, + OUT btpo_flags int4) +AS 'MODULE_PATHNAME', 'bt_page_stats_1_9' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_page_items() +-- +DROP FUNCTION bt_page_items(text, int8); +DROP FUNCTION IF EXISTS bt_page_items(text, int4); +CREATE FUNCTION bt_page_items(IN relname text, IN blkno int8, + OUT itemoffset smallint, + OUT ctid tid, + OUT itemlen smallint, + OUT nulls bool, + OUT vars bool, + OUT data text, + OUT dead boolean, + OUT htid tid, + OUT tids tid[]) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'bt_page_items_1_9' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- brin_page_items() +-- +DROP FUNCTION brin_page_items(IN page bytea, IN index_oid regclass); +CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass, + OUT itemoffset int, + OUT blknum int8, + OUT attnum int, + OUT allnulls bool, + OUT hasnulls bool, + OUT placeholder bool, + OUT value text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'brin_page_items' +LANGUAGE C STRICT PARALLEL SAFE; diff --git a/contrib/pageinspect/pageinspect--1.5.sql b/contrib/pageinspect/pageinspect--1.5.sql index 1e40c3c97e..fdbd2995a2 100644 --- a/contrib/pageinspect/pageinspect--1.5.sql +++ b/contrib/pageinspect/pageinspect--1.5.sql @@ -28,6 +28,8 @@ CREATE FUNCTION page_header(IN page bytea, OUT special smallint, OUT pagesize smallint, OUT version smallint, + OUT xid_base xid, + OUT multi_base xid, 
OUT prune_xid xid) AS 'MODULE_PATHNAME', 'page_header' LANGUAGE C STRICT PARALLEL SAFE; diff --git a/contrib/pageinspect/pageinspect.control b/contrib/pageinspect/pageinspect.control index 7cdf37913d..f277413dd8 100644 --- a/contrib/pageinspect/pageinspect.control +++ b/contrib/pageinspect/pageinspect.control @@ -1,5 +1,5 @@ # pageinspect extension comment = 'inspect the contents of database pages at a low level' -default_version = '1.10' +default_version = '1.11' module_pathname = '$libdir/pageinspect' relocatable = true diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c index 90942be71e..6d50940955 100644 --- a/contrib/pageinspect/rawpage.c +++ b/contrib/pageinspect/rawpage.c @@ -17,6 +17,7 @@ #include "access/htup_details.h" #include "access/relation.h" +#include "commands/sequence.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" #include "funcapi.h" @@ -251,8 +252,9 @@ page_header(PG_FUNCTION_ARGS) Datum result; HeapTuple tuple; - Datum values[9]; - bool nulls[9]; + Datum values[11]; + bool nulls[11]; + bool is_toast; Page page; PageHeader pageheader; @@ -314,12 +316,37 @@ page_header(PG_FUNCTION_ARGS) } values[7] = UInt16GetDatum(PageGetPageLayoutVersion(page)); - values[8] = TransactionIdGetDatum(pageheader->pd_prune_xid); + is_toast = PageGetSpecialSize(page) == + MAXALIGN(sizeof(ToastPageSpecialData)); + values[8] = TransactionIdGetDatum(HeapPageGetPruneXidNoAssert((Page) page, + is_toast)); /* Build and return the tuple. 
*/ - memset(nulls, 0, sizeof(nulls)); + if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HeapPageSpecialData))) + { + /* Heap page */ + HeapPageSpecial pageSpecial = HeapPageGetSpecial((Page) page); + + values[9] = TransactionIdGetDatum(pageSpecial->pd_xid_base); + values[10] = TransactionIdGetDatum(pageSpecial->pd_multi_base); + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(ToastPageSpecialData))) + { + /* TOAST page */ + ToastPageSpecial pageSpecial = ToastPageGetSpecial((Page) page); + + values[9] = TransactionIdGetDatum(pageSpecial->pd_xid_base); + nulls[10] = true; + } + else + { + /* Double xmax page */ + nulls[9] = true; + nulls[10] = true; + } + tuple = heap_form_tuple(tupdesc, values, nulls); result = HeapTupleGetDatum(tuple); diff --git a/contrib/pageinspect/sql/btree.sql b/contrib/pageinspect/sql/btree.sql index 1f554f0f67..538d71d23a 100644 --- a/contrib/pageinspect/sql/btree.sql +++ b/contrib/pageinspect/sql/btree.sql @@ -40,7 +40,8 @@ SELECT bt_page_items(get_raw_page('test1_b_gist', 0)); SELECT bt_page_items('aaa'::bytea); -- invalid special area size CREATE INDEX test1_a_brin ON test1 USING brin(a); -SELECT bt_page_items(get_raw_page('test1', 0)); +-- XXX: false positive in 64xids due to equal sizes of BTPageOpaque and HeapPageSpecialData +-- SELECT bt_page_items(get_raw_page('test1', 0)); SELECT bt_page_items(get_raw_page('test1_a_brin', 0)); \set VERBOSITY default diff --git a/contrib/pg_surgery/heap_surgery.c b/contrib/pg_surgery/heap_surgery.c index 8a2ad9773d..439ebaf8e2 100644 --- a/contrib/pg_surgery/heap_surgery.c +++ b/contrib/pg_surgery/heap_surgery.c @@ -15,6 +15,7 @@ #include "access/heapam.h" #include "access/visibilitymap.h" #include "access/xloginsert.h" +#include "catalog/catalog.h" #include "catalog/pg_am_d.h" #include "catalog/pg_proc_d.h" #include "miscadmin.h" @@ -272,11 +273,18 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) else { HeapTupleHeader htup; + HeapTupleData tuple; 
Assert(heap_force_opt == HEAP_FORCE_FREEZE); htup = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_data = htup; + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + HeapTupleCopyBaseFromPage(buf, &tuple, page, + IsToastRelation(rel)); + /* * Reset all visibility-related fields of the tuple. This * logic should mimic heap_execute_freeze_tuple(), but we @@ -284,8 +292,18 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) * potentially-garbled data is left behind. */ ItemPointerSet(&htup->t_ctid, blkno, curoff); - HeapTupleHeaderSetXmin(htup, FrozenTransactionId); - HeapTupleHeaderSetXmax(htup, InvalidTransactionId); + if (IsToastRelation(rel)) + { + ToastTupleHeaderSetXmin(page, &tuple); + ToastTupleHeaderSetXmax(page, &tuple); + } + else + { + HeapTupleHeaderSetXmin(page, &tuple); + HeapTupleHeaderSetXmax(page, &tuple); + } + HeapTupleSetXmin(&tuple, FrozenTransactionId); + HeapTupleSetXmax(&tuple, InvalidTransactionId); if (htup->t_infomask & HEAP_MOVED) { if (htup->t_infomask & HEAP_MOVED_OFF) diff --git a/contrib/pg_visibility/expected/pg_visibility.out b/contrib/pg_visibility/expected/pg_visibility.out index 9de54db2a2..d3c893b4e3 100644 --- a/contrib/pg_visibility/expected/pg_visibility.out +++ b/contrib/pg_visibility/expected/pg_visibility.out @@ -267,6 +267,22 @@ select * from pg_check_frozen('copyfreeze'); -------- (0 rows) +create table vacuum_test as select 42 i; +vacuum (disable_page_skipping) vacuum_test; +-- pg_check_visible() can report false positive due to autovacuum activity. +-- To workaround this issue, repeat the call. +do $$ +declare + non_visible_count bigint; + i integer; +begin + for i in 1 .. 
10 loop + if i > 1 then perform pg_sleep(1); end if; + select count(*) from pg_check_visible('vacuum_test') into non_visible_count; + if non_visible_count = 0 then exit; end if; + end loop; + if non_visible_count > 0 then raise exception 'The visibility map is corrupt.'; end if; +end $$; -- cleanup drop table test_partitioned; drop view test_view; @@ -277,3 +293,4 @@ drop foreign data wrapper dummy; drop materialized view matview_visibility_test; drop table regular_table; drop table copyfreeze; +drop table vacuum_test; diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index a95f73ec79..d2296c2d02 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -14,6 +14,7 @@ #include "access/htup_details.h" #include "access/visibilitymap.h" #include "access/xloginsert.h" +#include "catalog/catalog.h" #include "catalog/pg_type.h" #include "catalog/storage_xlog.h" #include "funcapi.h" @@ -652,6 +653,8 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = relid; + HeapTupleCopyBaseFromPage(buffer, &tuple, page, + IsToastRelation(rel)); /* * If we're checking whether the page is all-visible, we expect @@ -695,7 +698,7 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) */ if (check_frozen) { - if (heap_tuple_needs_eventual_freeze(tuple.t_data)) + if (heap_tuple_needs_eventual_freeze(&tuple)) record_corrupt_item(items, &tuple.t_self); } } @@ -758,7 +761,7 @@ tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer) * be set here. So just check the xmin. 
*/ - xmin = HeapTupleHeaderGetXmin(tup->t_data); + xmin = HeapTupleGetXmin(tup); if (!TransactionIdPrecedes(xmin, OldestXmin)) return false; /* xmin not old enough for all to see */ diff --git a/contrib/pg_visibility/sql/pg_visibility.sql b/contrib/pg_visibility/sql/pg_visibility.sql index ff3538f996..a0d9525df9 100644 --- a/contrib/pg_visibility/sql/pg_visibility.sql +++ b/contrib/pg_visibility/sql/pg_visibility.sql @@ -170,6 +170,23 @@ commit; select * from pg_visibility_map('copyfreeze'); select * from pg_check_frozen('copyfreeze'); +create table vacuum_test as select 42 i; +vacuum (disable_page_skipping) vacuum_test; +-- pg_check_visible() can report false positive due to autovacuum activity. +-- To workaround this issue, repeat the call. +do $$ +declare + non_visible_count bigint; + i integer; +begin + for i in 1 .. 10 loop + if i > 1 then perform pg_sleep(1); end if; + select count(*) from pg_check_visible('vacuum_test') into non_visible_count; + if non_visible_count = 0 then exit; end if; + end loop; + if non_visible_count > 0 then raise exception 'The visibility map is corrupt.'; end if; +end $$; + -- cleanup drop table test_partitioned; drop view test_view; @@ -180,3 +197,4 @@ drop foreign data wrapper dummy; drop materialized view matview_visibility_test; drop table regular_table; drop table copyfreeze; +drop table vacuum_test; diff --git a/contrib/pgrowlocks/pgrowlocks.c b/contrib/pgrowlocks/pgrowlocks.c index ef89b84ec3..0abf1ea21a 100644 --- a/contrib/pgrowlocks/pgrowlocks.c +++ b/contrib/pgrowlocks/pgrowlocks.c @@ -130,7 +130,7 @@ pgrowlocks(PG_FUNCTION_ARGS) htsu = HeapTupleSatisfiesUpdate(tuple, GetCurrentCommandId(false), hscan->rs_cbuf); - xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); + xmax = HeapTupleGetRawXmax(tuple); infomask = tuple->t_data->t_infomask; /* diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index b827728326..5bf73251b7 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ 
b/contrib/pgstattuple/pgstatapprox.c @@ -19,6 +19,7 @@ #include "access/transam.h" #include "access/visibilitymap.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "catalog/namespace.h" #include "catalog/pg_am_d.h" #include "commands/vacuum.h" @@ -153,6 +154,7 @@ statapprox_heap(Relation rel, output_type *stat) tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(rel); + HeapTupleCopyBaseFromPage(buf, &tuple, page, IsToastRelation(rel)); /* * We follow VACUUM's lead in counting INSERT_IN_PROGRESS tuples diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index d69ac1c93d..4d045b2764 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -605,7 +605,7 @@ pgstathashindex(PG_FUNCTION_ARGS) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); stats.version = metap->hashm_version; - stats.space_per_page = metap->hashm_bsize; + stats.space_per_page = BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(HashPageOpaqueData)); _hash_relbuf(rel, metabuf); /* Get the current relation length */ diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index cc9e39c4a5..69bd2c929f 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -4599,16 +4599,24 @@ UPDATE ft2 SET c2 = c2 + 300, c3 = c3 || '_update3' WHERE c1 % 10 = 3; UPDATE ft2 SET c2 = c2 + 300, c3 = c3 || '_update3' WHERE c1 % 10 = 3; EXPLAIN (verbose, costs off) -UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *; -- can be pushed down - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------- - Update on public.ft2 - Output: c1, c2, c3, c4, c5, 
c6, c7, c8 - -> Foreign Update on public.ft2 - Remote SQL: UPDATE "S 1"."T 1" SET c2 = (c2 + 400), c3 = (c3 || '_update7') WHERE ((("C 1" % 10) = 7)) RETURNING "C 1", c2, c3, c4, c5, c6, c7, c8 -(4 rows) +WITH t AS (UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *) +SELECT * FROM t ORDER BY c1; -- can be pushed down + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Sort + Output: t.c1, t.c2, t.c3, t.c4, t.c5, t.c6, t.c7, t.c8 + Sort Key: t.c1 + CTE t + -> Update on public.ft2 + Output: ft2.c1, ft2.c2, ft2.c3, ft2.c4, ft2.c5, ft2.c6, ft2.c7, ft2.c8 + -> Foreign Update on public.ft2 + Remote SQL: UPDATE "S 1"."T 1" SET c2 = (c2 + 400), c3 = (c3 || '_update7') WHERE ((("C 1" % 10) = 7)) RETURNING "C 1", c2, c3, c4, c5, c6, c7, c8 + -> CTE Scan on t + Output: t.c1, t.c2, t.c3, t.c4, t.c5, t.c6, t.c7, t.c8 +(10 rows) -UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *; +WITH t AS (UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *) +SELECT * FROM t ORDER BY c1; c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 ------+-----+--------------------+------------------------------+--------------------------+----+------------+----- 7 | 407 | 00007_update7 | Thu Jan 08 00:00:00 1970 PST | Thu Jan 08 00:00:00 1970 | 7 | 7 | foo @@ -4728,16 +4736,24 @@ UPDATE ft2 SET c2 = ft2.c2 + 500, c3 = ft2.c3 || '_update9', c7 = DEFAULT UPDATE ft2 SET c2 = ft2.c2 + 500, c3 = ft2.c3 || '_update9', c7 = DEFAULT FROM ft1 WHERE ft1.c1 = ft2.c2 AND ft1.c1 % 10 = 9; EXPLAIN (verbose, costs off) - DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4; -- can be pushed down - QUERY PLAN --------------------------------------------------------------------------------------------- - Delete on public.ft2 - Output: c1, c4 - -> Foreign Delete on public.ft2 - Remote SQL: DELETE FROM "S 1"."T 1" 
WHERE ((("C 1" % 10) = 5)) RETURNING "C 1", c4 -(4 rows) + WITH t AS (DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4) + SELECT * FROM t ORDER BY c1; -- can be pushed down + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Sort + Output: t.c1, t.c4 + Sort Key: t.c1 + CTE t + -> Delete on public.ft2 + Output: ft2.c1, ft2.c4 + -> Foreign Delete on public.ft2 + Remote SQL: DELETE FROM "S 1"."T 1" WHERE ((("C 1" % 10) = 5)) RETURNING "C 1", c4 + -> CTE Scan on t + Output: t.c1, t.c4 +(10 rows) -DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4; +WITH t AS (DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4) +SELECT * FROM t ORDER BY c1; c1 | c4 ------+------------------------------ 5 | Tue Jan 06 00:00:00 1970 PST @@ -5998,7 +6014,8 @@ INSERT INTO ft2 (c1,c2,c3,c6) VALUES (1218, 818, 'ggg', '(--;') RETURNING *; 1218 | 818 | ggg_trig_update | | | (--; | ft2 | (1 row) -UPDATE ft2 SET c2 = c2 + 600 WHERE c1 % 10 = 8 AND c1 < 1200 RETURNING *; +WITH t AS (UPDATE ft2 SET c2 = c2 + 600 WHERE c1 % 10 = 8 AND c1 < 1200 RETURNING *) +SELECT * FROM t ORDER BY c1; c1 | c2 | c3 | c4 | c5 | c6 | c7 | c8 ------+-----+------------------------+------------------------------+--------------------------+----+------------+----- 8 | 608 | 00008_trig_update | Fri Jan 09 00:00:00 1970 PST | Fri Jan 09 00:00:00 1970 | 8 | 8 | foo diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index dd858aba03..b54fbeccca 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -4814,8 +4814,8 @@ apply_returning_filter(PgFdwDirectModifyState *dmstate, * Note: no need to care about tableoid here because it will be * initialized in ExecProcessReturning(). 
*/ - HeapTupleHeaderSetXmin(resultTup->t_data, InvalidTransactionId); - HeapTupleHeaderSetXmax(resultTup->t_data, InvalidTransactionId); + HeapTupleSetXmin(resultTup, InvalidTransactionId); + HeapTupleSetXmax(resultTup, InvalidTransactionId); HeapTupleHeaderSetCmin(resultTup->t_data, InvalidTransactionId); } @@ -7328,6 +7328,7 @@ make_tuple_from_result_row(PGresult *res, */ if (ctid) tuple->t_self = tuple->t_data->t_ctid = *ctid; + HeapTupleSetZeroBase(tuple); /* * Stomp on the xmin, xmax, and cmin fields from the tuple created by @@ -7337,8 +7338,8 @@ make_tuple_from_result_row(PGresult *res, * assumption. If we don't do this then, for example, the tuple length * ends up in the xmin field, which isn't what we want. */ - HeapTupleHeaderSetXmax(tuple->t_data, InvalidTransactionId); - HeapTupleHeaderSetXmin(tuple->t_data, InvalidTransactionId); + HeapTupleSetXmax(tuple, InvalidTransactionId); + HeapTupleSetXmin(tuple, InvalidTransactionId); HeapTupleHeaderSetCmin(tuple->t_data, InvalidTransactionId); /* Clean up */ diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index e48ccd286b..b2f035b50f 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -1285,16 +1285,20 @@ EXPLAIN (verbose, costs off) UPDATE ft2 SET c2 = c2 + 300, c3 = c3 || '_update3' WHERE c1 % 10 = 3; -- can be pushed down UPDATE ft2 SET c2 = c2 + 300, c3 = c3 || '_update3' WHERE c1 % 10 = 3; EXPLAIN (verbose, costs off) -UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *; -- can be pushed down -UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *; +WITH t AS (UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *) +SELECT * FROM t ORDER BY c1; -- can be pushed down +WITH t AS (UPDATE ft2 SET c2 = c2 + 400, c3 = c3 || '_update7' WHERE c1 % 10 = 7 RETURNING *) +SELECT * FROM t ORDER BY c1; EXPLAIN (verbose, costs off) 
UPDATE ft2 SET c2 = ft2.c2 + 500, c3 = ft2.c3 || '_update9', c7 = DEFAULT FROM ft1 WHERE ft1.c1 = ft2.c2 AND ft1.c1 % 10 = 9; -- can be pushed down UPDATE ft2 SET c2 = ft2.c2 + 500, c3 = ft2.c3 || '_update9', c7 = DEFAULT FROM ft1 WHERE ft1.c1 = ft2.c2 AND ft1.c1 % 10 = 9; EXPLAIN (verbose, costs off) - DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4; -- can be pushed down -DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4; + WITH t AS (DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4) + SELECT * FROM t ORDER BY c1; -- can be pushed down +WITH t AS (DELETE FROM ft2 WHERE c1 % 10 = 5 RETURNING c1, c4) +SELECT * FROM t ORDER BY c1; EXPLAIN (verbose, costs off) DELETE FROM ft2 USING ft1 WHERE ft1.c1 = ft2.c2 AND ft1.c1 % 10 = 2; -- can be pushed down DELETE FROM ft2 USING ft1 WHERE ft1.c1 = ft2.c2 AND ft1.c1 % 10 = 2; @@ -1401,7 +1405,8 @@ CREATE TRIGGER t1_br_insert BEFORE INSERT OR UPDATE INSERT INTO ft2 (c1,c2,c3) VALUES (1208, 818, 'fff') RETURNING *; INSERT INTO ft2 (c1,c2,c3,c6) VALUES (1218, 818, 'ggg', '(--;') RETURNING *; -UPDATE ft2 SET c2 = c2 + 600 WHERE c1 % 10 = 8 AND c1 < 1200 RETURNING *; +WITH t AS (UPDATE ft2 SET c2 = c2 + 600 WHERE c1 % 10 = 8 AND c1 < 1200 RETURNING *) +SELECT * FROM t ORDER BY c1; -- Test errors thrown on remote side during update ALTER TABLE "S 1"."T 1" ADD CONSTRAINT c2positive CHECK (c2 >= 0); diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 7e355585a0..9ce239bbbf 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -640,10 +640,10 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) result = PointerGetDatum(&(tup->t_self)); break; case MinTransactionIdAttributeNumber: - result = TransactionIdGetDatum(HeapTupleHeaderGetRawXmin(tup->t_data)); + result = TransactionIdGetDatum(HeapTupleGetRawXmin(tup)); break; case MaxTransactionIdAttributeNumber: - result = 
TransactionIdGetDatum(HeapTupleHeaderGetRawXmax(tup->t_data)); + result = TransactionIdGetDatum(HeapTupleGetRawXmax(tup)); break; case MinCommandIdAttributeNumber: case MaxCommandIdAttributeNumber: @@ -688,6 +688,7 @@ heap_copytuple(HeapTuple tuple) newTuple->t_len = tuple->t_len; newTuple->t_self = tuple->t_self; newTuple->t_tableOid = tuple->t_tableOid; + HeapTupleCopyBase(newTuple, tuple); newTuple->t_data = (HeapTupleHeader) ((char *) newTuple + HEAPTUPLESIZE); memcpy((char *) newTuple->t_data, (char *) tuple->t_data, tuple->t_len); return newTuple; @@ -714,6 +715,7 @@ heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest) dest->t_len = src->t_len; dest->t_self = src->t_self; dest->t_tableOid = src->t_tableOid; + HeapTupleCopyBase(dest, src); dest->t_data = (HeapTupleHeader) palloc(src->t_len); memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len); } @@ -1161,6 +1163,7 @@ heap_modify_tuple(HeapTuple tuple, newTuple->t_data->t_ctid = tuple->t_data->t_ctid; newTuple->t_self = tuple->t_self; newTuple->t_tableOid = tuple->t_tableOid; + HeapTupleCopyBase(newTuple, tuple); return newTuple; } @@ -1224,6 +1227,7 @@ heap_modify_tuple_by_cols(HeapTuple tuple, newTuple->t_data->t_ctid = tuple->t_data->t_ctid; newTuple->t_self = tuple->t_self; newTuple->t_tableOid = tuple->t_tableOid; + HeapTupleCopyBase(newTuple, tuple); return newTuple; } diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index b0993f37d4..1da7f572c3 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -260,58 +260,6 @@ static relopt_int intRelOpts[] = }, -1, 1, 10000 }, - { - { - "autovacuum_freeze_min_age", - "Minimum age at which VACUUM should freeze a table row, for autovacuum", - RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, - ShareUpdateExclusiveLock - }, - -1, 0, 1000000000 - }, - { - { - "autovacuum_multixact_freeze_min_age", - "Minimum multixact age at which VACUUM should freeze a row multixact's, for 
autovacuum", - RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, - ShareUpdateExclusiveLock - }, - -1, 0, 1000000000 - }, - { - { - "autovacuum_freeze_max_age", - "Age at which to autovacuum a table to prevent transaction ID wraparound", - RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, - ShareUpdateExclusiveLock - }, - -1, 100000, 2000000000 - }, - { - { - "autovacuum_multixact_freeze_max_age", - "Multixact age at which to autovacuum a table to prevent multixact wraparound", - RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, - ShareUpdateExclusiveLock - }, - -1, 10000, 2000000000 - }, - { - { - "autovacuum_freeze_table_age", - "Age at which VACUUM should perform a full table sweep to freeze row versions", - RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, - ShareUpdateExclusiveLock - }, -1, 0, 2000000000 - }, - { - { - "autovacuum_multixact_freeze_table_age", - "Age of multixact at which VACUUM should perform a full table sweep to freeze row versions", - RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, - ShareUpdateExclusiveLock - }, -1, 0, 2000000000 - }, { { "log_autovacuum_min_duration", @@ -388,6 +336,60 @@ static relopt_int intRelOpts[] = static relopt_int64 int64RelOpts[] = { + { + { + "autovacuum_freeze_min_age", + "Minimum age at which VACUUM should freeze a table row, for autovacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + INT64CONST(-1), INT64CONST(0), INT64CONST(1000000000) + }, + { + { + "autovacuum_multixact_freeze_min_age", + "Minimum multixact age at which VACUUM should freeze a row multixact's, for autovacuum", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + INT64CONST(-1), INT64CONST(0), INT64CONST(1000000000) + }, + { + { + "autovacuum_freeze_max_age", + "Age at which to autovacuum a table to prevent transaction ID wraparound", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + INT64CONST(-1), INT64CONST(100000), INT64CONST(2000000000) + }, + { + { + "autovacuum_multixact_freeze_max_age", + "Multixact age at which to 
autovacuum a table to prevent multixact wraparound", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + INT64CONST(-1), INT64CONST(10000), INT64CONST(2000000000) + }, + { + { + "autovacuum_freeze_table_age", + "Age at which VACUUM should perform a full table sweep to freeze row versions", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + INT64CONST(-1), INT64CONST(0), INT64CONST(2000000000) + }, + { + { + "autovacuum_multixact_freeze_table_age", + "Age of multixact at which VACUUM should perform a full table sweep to freeze row versions", + RELOPT_KIND_HEAP | RELOPT_KIND_TOAST, + ShareUpdateExclusiveLock + }, + INT64CONST(-1), INT64CONST(0), INT64CONST(2000000000) + }, /* list terminator */ {{NULL}} }; @@ -1920,17 +1922,17 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, analyze_threshold)}, {"autovacuum_vacuum_cost_limit", RELOPT_TYPE_INT, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, vacuum_cost_limit)}, - {"autovacuum_freeze_min_age", RELOPT_TYPE_INT, + {"autovacuum_freeze_min_age", RELOPT_TYPE_INT64, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, freeze_min_age)}, - {"autovacuum_freeze_max_age", RELOPT_TYPE_INT, + {"autovacuum_freeze_max_age", RELOPT_TYPE_INT64, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, freeze_max_age)}, - {"autovacuum_freeze_table_age", RELOPT_TYPE_INT, + {"autovacuum_freeze_table_age", RELOPT_TYPE_INT64, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, freeze_table_age)}, - {"autovacuum_multixact_freeze_min_age", RELOPT_TYPE_INT, + {"autovacuum_multixact_freeze_min_age", RELOPT_TYPE_INT64, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, multixact_freeze_min_age)}, - {"autovacuum_multixact_freeze_max_age", RELOPT_TYPE_INT, + {"autovacuum_multixact_freeze_max_age", RELOPT_TYPE_INT64, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, 
multixact_freeze_max_age)}, - {"autovacuum_multixact_freeze_table_age", RELOPT_TYPE_INT, + {"autovacuum_multixact_freeze_table_age", RELOPT_TYPE_INT64, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, multixact_freeze_table_age)}, {"log_autovacuum_min_duration", RELOPT_TYPE_INT, offsetof(StdRdOptions, autovacuum) + offsetof(AutoVacOpts, log_min_duration)}, diff --git a/src/backend/access/hash/hashvalidate.c b/src/backend/access/hash/hashvalidate.c index 10bf26ce7c..83e033b93c 100644 --- a/src/backend/access/hash/hashvalidate.c +++ b/src/backend/access/hash/hashvalidate.c @@ -317,11 +317,10 @@ check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype) * INTERNAL and allowing any such function seems too scary. */ if ((funcid == F_HASHINT4 || funcid == F_HASHINT4EXTENDED) && - (argtype == DATEOID || - argtype == XIDOID || argtype == CIDOID)) + (argtype == DATEOID || argtype == CIDOID)) /* okay, allowed use of hashint4() */ ; else if ((funcid == F_HASHINT8 || funcid == F_HASHINT8EXTENDED) && - (argtype == XID8OID)) + (argtype == XID8OID || argtype == XIDOID)) /* okay, allowed use of hashint8() */ ; else if ((funcid == F_TIMESTAMP_HASH || funcid == F_TIMESTAMP_HASH_EXTENDED) && diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 685200d154..5bc35a28d8 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -52,10 +52,14 @@ #include "access/xloginsert.h" #include "access/xlogutils.h" #include "catalog/catalog.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "commands/vacuum.h" #include "miscadmin.h" #include "pgstat.h" #include "port/atomics.h" #include "port/pg_bitutils.h" +#include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -73,7 +77,7 @@ static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, - TransactionId xid, CommandId cid, int options); + CommandId cid, int options); 
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, @@ -110,6 +114,8 @@ static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy); +static bool heap_page_prepare_for_xid(Relation relation, Buffer buffer, + TransactionId xid, bool multi); /* @@ -460,6 +466,8 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); loctup.t_len = ItemIdGetLength(lpp); + HeapTupleCopyBaseFromPage(buffer, &loctup, dp, + IsToastRelation(scan->rs_base.rs_rd)); ItemPointerSet(&(loctup.t_self), page, lineoff); if (all_visible) @@ -676,6 +684,8 @@ heapgettup(HeapScanDesc scan, tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); + HeapTupleCopyBaseFromPage(scan->rs_cbuf, tuple, dp, + IsToastRelation(scan->rs_base.rs_rd)); return; } @@ -702,6 +712,8 @@ heapgettup(HeapScanDesc scan, tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); + HeapTupleCopyBaseFromPage(scan->rs_cbuf, tuple, dp, + IsToastRelation(scan->rs_base.rs_rd)); ItemPointerSet(&(tuple->t_self), page, lineoff); /* @@ -1001,6 +1013,8 @@ heapgettup_pagemode(HeapScanDesc scan, tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); + HeapTupleCopyBaseFromPage(InvalidBuffer, tuple, dp, + IsToastRelation(scan->rs_base.rs_rd)); /* check that rs_cindex is in sync */ Assert(scan->rs_cindex < scan->rs_ntuples); @@ -1023,6 +1037,8 @@ heapgettup_pagemode(HeapScanDesc scan, tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); + HeapTupleCopyBaseFromPage(InvalidBuffer, tuple, 
dp, + IsToastRelation(scan->rs_base.rs_rd)); ItemPointerSet(&(tuple->t_self), page, lineoff); /* @@ -1614,6 +1630,7 @@ heap_fetch(Relation relation, tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); tuple->t_len = ItemIdGetLength(lp); tuple->t_tableOid = RelationGetRelid(relation); + HeapTupleCopyBaseFromPage(buffer, tuple, page, IsToastRelation(relation)); /* * check tuple visibility, then release lock @@ -1622,7 +1639,7 @@ heap_fetch(Relation relation, if (valid) PredicateLockTID(relation, &(tuple->t_self), snapshot, - HeapTupleHeaderGetXmin(tuple->t_data)); + HeapTupleGetXmin(tuple)); HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); @@ -1699,6 +1716,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Assert(TransactionIdIsValid(RecentXmin)); Assert(BufferGetBlockNumber(buffer) == blkno); + heapTuple->t_self = *tid; + /* Scan through possible multiple members of HOT-chain */ for (;;) { @@ -1734,6 +1753,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp); heapTuple->t_len = ItemIdGetLength(lp); heapTuple->t_tableOid = RelationGetRelid(relation); + HeapTupleCopyBaseFromPage(buffer, heapTuple, dp, + IsToastRelation(relation)); ItemPointerSet(&heapTuple->t_self, blkno, offnum); /* @@ -1748,7 +1769,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, */ if (TransactionIdIsValid(prev_xmax) && !TransactionIdEquals(prev_xmax, - HeapTupleHeaderGetXmin(heapTuple->t_data))) + HeapTupleGetXmin(heapTuple))) break; /* @@ -1769,7 +1790,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, { ItemPointerSetOffsetNumber(tid, offnum); PredicateLockTID(relation, &heapTuple->t_self, snapshot, - HeapTupleHeaderGetXmin(heapTuple->t_data)); + HeapTupleGetXmin(heapTuple)); if (all_dead) *all_dead = false; return true; @@ -1804,7 +1825,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation 
relation, Buffer buffer, blkno); offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid); at_chain_start = false; - prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); + prev_xmax = HeapTupleGetUpdateXidAny(heapTuple); } else break; /* end of chain */ @@ -1891,13 +1912,14 @@ heap_get_latest_tid(TableScanDesc sscan, tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); tp.t_len = ItemIdGetLength(lp); tp.t_tableOid = RelationGetRelid(relation); + HeapTupleCopyBaseFromPage(buffer, &tp, page, IsToastRelation(relation)); /* * After following a t_ctid link, we might arrive at an unrelated * tuple. Check for XMIN match. */ if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + !TransactionIdEquals(priorXmax, HeapTupleGetXmin(&tp))) { UnlockReleaseBuffer(buffer); break; @@ -1916,7 +1938,7 @@ heap_get_latest_tid(TableScanDesc sscan, * If there's a valid t_ctid link, follow it, else we're done. */ if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || - HeapTupleHeaderIsOnlyLocked(tp.t_data) || + HeapTupleIsOnlyLocked(&tp) || HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) || ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) { @@ -1925,7 +1947,7 @@ heap_get_latest_tid(TableScanDesc sscan, } ctid = tp.t_data->t_ctid; - priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); + priorXmax = HeapTupleGetUpdateXidAny(&tp); UnlockReleaseBuffer(buffer); } /* end of loop */ } @@ -1950,7 +1972,7 @@ heap_get_latest_tid(TableScanDesc sscan, static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid) { - Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid)); + Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(BufferGetPage(buffer), tuple), xid)); Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))) @@ -2042,7 +2064,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * Note: below this point, 
heaptup is the data we actually intend to store * into the relation; tup is the caller's original untoasted data. */ - heaptup = heap_prepare_insert(relation, tup, xid, cid, options); + heaptup = heap_prepare_insert(relation, tup, cid, options); /* * Find buffer to insert this tuple into. If the page is all visible, @@ -2069,6 +2091,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, */ CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); + heap_page_prepare_for_xid(relation, buffer, xid, false); + HeapTupleSetXmin(heaptup, xid); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -2144,12 +2169,23 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, { xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; bufflags |= REGBUF_KEEP_DATA; - - if (IsToastRelation(relation)) - xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; } + if (IsToastRelation(relation)) + xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; + XLogBeginInsert(); + + if (info & XLOG_HEAP_INIT_PAGE) + { + char *base; + + base = IsToastRelation(relation) ? + (char *) &ToastPageGetSpecial(page)->pd_xid_base : + (char *) &HeapPageGetSpecial(page)->pd_xid_base; + XLogRegisterData(base, sizeof(TransactionId)); + } + XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; @@ -2204,6 +2240,535 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } } +static void +xid_min_max(ShortTransactionId *min, ShortTransactionId *max, + ShortTransactionId xid, + bool *found) +{ + Assert(TransactionIdIsNormal(xid)); + Assert(xid <= MaxShortTransactionId); + + if (!*found) + { + *min = *max = xid; + *found = true; + } + else + { + *min = Min(*min, xid); + *max = Max(*max, xid); + } +} + +/* + * Find minimum and maximum short transaction ids which occurs in the page. + * + * Works for multi and non multi transaction. Which is defined by "multi" + * argument. 
+ */ +static bool +heap_page_xid_min_max(Page page, bool multi, + ShortTransactionId *min, ShortTransactionId *max, + bool is_toast) +{ + bool found; + OffsetNumber offnum, + maxoff; + ItemId itemid; + HeapTupleHeader htup; + + maxoff = PageGetMaxOffsetNumber(page); + found = false; + + Assert(!multi || !is_toast); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsNormal(itemid)) + continue; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + if (!multi) + { + /* + * For non multi transactions we should see inside the tuple for + * update transaction. + */ + Assert(!is_toast || !(htup->t_infomask & HEAP_XMAX_IS_MULTI)); + + if (TransactionIdIsNormal(htup->t_choice.t_heap.t_xmin) && + !HeapTupleHeaderXminFrozen(htup)) + { + xid_min_max(min, max, htup->t_choice.t_heap.t_xmin, &found); + } + + if ((htup->t_infomask & HEAP_XMAX_IS_MULTI) && + (!(htup->t_infomask & HEAP_XMAX_LOCK_ONLY))) + { + TransactionId update_xid; + ShortTransactionId xid; + + Assert(!is_toast); + update_xid = MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(page, htup), + htup->t_infomask); + xid = NormalTransactionIdToShort(HeapPageGetSpecial(page)->pd_xid_base, + update_xid); + + xid_min_max(min, max, xid, &found); + } + } + + if (!TransactionIdIsNormal(htup->t_choice.t_heap.t_xmax)) + continue; + + if (multi != ((htup->t_infomask & HEAP_XMAX_IS_MULTI) != 0)) + continue; + + xid_min_max(min, max, htup->t_choice.t_heap.t_xmax, &found); + } + + Assert(!found || (*min > InvalidTransactionId && *max <= MaxShortTransactionId)); + + return found; +} + +/* + * Shift xid base in the page. WAL-logged if buffer is specified. 
+ */ +static void +heap_page_shift_base(Relation relation, Buffer buffer, Page page, + bool multi, int64 delta, bool is_toast) +{ + TransactionId *xid_base, + *multi_base; + OffsetNumber offnum, + maxoff; + ItemId itemid; + HeapTupleHeader htup; + + Assert(IsBufferLockedExclusive(buffer)); + + xid_base = multi_base = NULL; + + START_CRIT_SECTION(); + + if (is_toast) + { + Assert(!multi); + xid_base = &ToastPageGetSpecial(page)->pd_xid_base; + } + else + { + HeapPageSpecial special; + + special = HeapPageGetSpecial(page); + xid_base = &special->pd_xid_base; + multi_base = &special->pd_multi_base; + } + + /* Iterate over page items */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsNormal(itemid)) + continue; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + /* Apply xid shift to heap tuple */ + if (!multi) + { + /* shift xmin */ + if (TransactionIdIsNormal(htup->t_choice.t_heap.t_xmin) && + !HeapTupleHeaderXminFrozen(htup)) + { + Assert(htup->t_choice.t_heap.t_xmin - delta >= FirstNormalTransactionId); + Assert(htup->t_choice.t_heap.t_xmin - delta <= MaxShortTransactionId); + htup->t_choice.t_heap.t_xmin -= delta; + } + } + + /* shift xmax */ + if (!TransactionIdIsNormal(htup->t_choice.t_heap.t_xmax)) + continue; + + if (multi != (bool) (htup->t_infomask & HEAP_XMAX_IS_MULTI)) + continue; + + Assert(htup->t_choice.t_heap.t_xmax - delta >= FirstNormalTransactionId); + Assert(htup->t_choice.t_heap.t_xmax - delta <= MaxShortTransactionId); + htup->t_choice.t_heap.t_xmax -= delta; + } + + /* Apply xid shift to base as well */ + if (!multi) + *xid_base += delta; + else + *multi_base += delta; + + if (BufferIsValid(buffer)) + MarkBufferDirty(buffer); + + /* Write WAL record if needed */ + if (relation && RelationNeedsWAL(relation)) + { + XLogRecPtr recptr; + xl_heap_base_shift xlrec; + + xlrec.delta = delta; + 
xlrec.multi = multi; + xlrec.flags = 0; + if (IsToastRelation(relation)) + xlrec.flags |= XLH_BASE_SHIFT_ON_TOAST_RELATION; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHeapBaseShift); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HEAP3_ID, XLOG_HEAP3_BASE_SHIFT); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); +} + +/* + * Freeze xids in the single heap page. Useful when we can't fit new xid even + * with base shift. + */ +static void +freeze_single_heap_page(Relation relation, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + OffsetNumber offnum, + maxoff; + HeapTupleData tuple; + int nfrozen = 0; + xl_heap_freeze_tuple *frozen; + TransactionId OldestXmin, + FreezeXid; + MultiXactId OldestMxact, + MultiXactCutoff; + GlobalVisState *vistest; + ItemId itemid; + bool tuple_totally_frozen; + int ndeleted, + nnewlpdead; + + vacuum_set_xid_limits(relation, 0, 0, 0, 0, &OldestMxact, + &OldestXmin, &FreezeXid, &MultiXactCutoff); + + vistest = GlobalVisTestFor(relation); + + ndeleted = heap_page_prune(relation, buffer, vistest, InvalidTransactionId, 0, + &nnewlpdead, &offnum, false); + if (ndeleted > nnewlpdead) + pgstat_update_heap_dead_tuples(relation, + ndeleted - nnewlpdead); + + /* + * Now scan the page to collect vacuumable items and check for tuples + * requiring freezing. + */ + maxoff = PageGetMaxOffsetNumber(page); + frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage); + + /* + * Note: If you change anything in the loop below, also look at + * heap_page_is_all_visible to see if that needs to be changed. 
+ */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + TransactionId NewRelfrozenXid; + MultiXactId NewRelminMxid; + + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsNormal(itemid)) + continue; + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(relation); + HeapTupleCopyBaseFromPage(buffer, &tuple, page, + IsToastRelation(relation)); + + /* + * Each non-removable tuple must be checked to see if it needs + * freezing. Note we already have exclusive buffer lock. + */ + if (heap_prepare_freeze_tuple(&tuple, + relation->rd_rel->relfrozenxid, + relation->rd_rel->relminmxid, + FreezeXid, MultiXactCutoff, + &frozen[nfrozen], &tuple_totally_frozen, + &NewRelfrozenXid, &NewRelminMxid)) + frozen[nfrozen++].offset = offnum; + } + + /* + * If we froze any tuples, mark the buffer dirty, and write a WAL record + * recording the changes. We must log the changes to be crash-safe + * against future truncation of CLOG. + */ + if (nfrozen > 0) + { + int i; + ItemId itemid; + HeapTupleHeader htup; + + START_CRIT_SECTION(); + + MarkBufferDirty(buffer); + + /* execute collected freezes */ + for (i = 0; i < nfrozen; i++) + { + itemid = PageGetItemId(page, frozen[i].offset); + htup = (HeapTupleHeader) PageGetItem(page, itemid); + heap_execute_freeze_tuple_page(page, htup, &frozen[i], + IsToastRelation(relation)); + } + + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(relation)) + { + XLogRecPtr recptr; + + recptr = log_heap_freeze(relation, buffer, FreezeXid, + frozen, nfrozen); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + pfree(frozen); + + return; +} + +/* + * Check if xid still fits on a page with given base and delta. 
+ */ +static inline bool +is_delta_fits_heap_page(TransactionId xid, TransactionId base, int64 delta) +{ + return xid >= base + delta + FirstNormalTransactionId && + xid <= base + delta + MaxShortTransactionId; +} + +/* + * Check if xid fits on a page with given base. + */ +static inline bool +is_xid_fits_heap_page(TransactionId xid, TransactionId base) +{ + return xid >= base + FirstNormalTransactionId && + xid <= base + MaxShortTransactionId; +} + +/* + * Check if delta fits on a page. + * + * If delta does not fits, never return. + */ +static void +heap_page_check_delta(Buffer buffer, + TransactionId xid, TransactionId base, + ShortTransactionId min, ShortTransactionId max, + int64 delta, int64 *freeDelta, int64 *requiredDelta) +{ + BufferDesc *buf; + char *path; + BackendId backend; + + Assert((freeDelta == NULL) == (requiredDelta == NULL)); + + /* + * If delta fits the page, we good to go ... + */ + if (is_delta_fits_heap_page(xid, base, delta)) + return; + + /* + * ... otherwise handle the error. + */ + if (buffer == InvalidBuffer) + return; + + if (BufferIsLocal(buffer)) + { + buf = GetLocalBufferDescriptor(-buffer - 1); + backend = MyBackendId; + } + else + { + buf = GetBufferDescriptor(buffer - 1); + backend = InvalidBackendId; + } + + path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend, + buf->tag.forkNum); + + if (freeDelta == NULL) + elog(FATAL, "Fatal xid base calculation error: xid = %llu, base = %llu, min = %u, max = %u, delta = %lld (rel=%s, blockNum=%u)", + (unsigned long long) xid, (unsigned long long) base, + min, max, + (long long) delta, + path, buf->tag.blockNum); + + elog(FATAL, "Fatal xid base calculation error: xid = %llu, base = %llu, min = %u, max = %u, freeDelta = %lld, requiredDelta = %lld, delta = %lld (rel=%s, blockNum=%u)", + (unsigned long long) xid, (unsigned long long) base, + min, max, + (long long) *freeDelta, (long long) *requiredDelta, + (long long) delta, + path, buf->tag.blockNum); +} + +/* + * Shift page base. 
+ */ +static void +heap_page_apply_delta(Relation relation, Buffer buffer, Page page, + TransactionId xid, bool multi, + TransactionId base, int64 delta, bool is_toast) +{ + Assert(is_delta_fits_heap_page(xid, base, delta)); + + heap_page_shift_base(relation, buffer, page, multi, delta, is_toast); + +#ifdef USE_ASSERT_CHECKING + if (is_toast) + { + Assert(!multi); + base = ToastPageGetSpecial(page)->pd_xid_base; + } + else + base = multi ? HeapPageGetSpecial(page)->pd_multi_base : + HeapPageGetSpecial(page)->pd_xid_base; + + Assert(is_xid_fits_heap_page(xid, base)); +#endif /* USE_ASSERT_CHECKING */ +} + +/* + * Try to fit xid on a page. + */ +static int +heap_page_try_prepare_for_xid(Relation relation, Buffer buffer, Page page, + TransactionId xid, bool multi, bool is_toast) +{ + TransactionId base; + ShortTransactionId min = InvalidTransactionId, + max = InvalidTransactionId; + int64 delta, + freeDelta, + requiredDelta; + + if (is_toast) + { + Assert(!multi); + base = ToastPageGetSpecial(page)->pd_xid_base; + } + else + { + HeapPageSpecial special; + + special = HeapPageGetSpecial(page); + base = multi ? special->pd_multi_base : special->pd_xid_base; + } + + /* If xid fits the page no action needed. */ + if (is_xid_fits_heap_page(xid, base)) + return 0; + + /* No items on the page? */ + if (!heap_page_xid_min_max(page, multi, &min, &max, is_toast)) + { + delta = (int64) (xid - FirstNormalTransactionId) - (int64) base; + heap_page_check_delta(buffer, xid, base, min, max, delta, NULL, NULL); + heap_page_apply_delta(relation, buffer, page, xid, multi, base, delta, + is_toast); + return 0; + } + + /* Can we just shift base on the page? 
*/ + if (xid < base + FirstNormalTransactionId) + { + freeDelta = MaxShortTransactionId - max; + requiredDelta = (base + FirstNormalTransactionId) - xid; + /* Shouldn't consider setting base less than 0 */ + freeDelta = Min(freeDelta, base); + + if (requiredDelta > freeDelta) + return -1; + + delta = -(freeDelta + requiredDelta) / 2; + } + else + { + freeDelta = min - FirstNormalTransactionId; + requiredDelta = xid - (base + MaxShortTransactionId); + + if (requiredDelta > freeDelta) + return -1; + + delta = (freeDelta + requiredDelta) / 2; + } + + heap_page_check_delta(buffer, xid, base, min, max, + delta, &freeDelta, &requiredDelta); + heap_page_apply_delta(relation, buffer, page, xid, multi, base, + delta, is_toast); + + return 0; +} + +/* + * Ensure that given xid fits base of given page. + */ +void +rewrite_page_prepare_for_xid(Page page, HeapTuple tup, bool is_toast) +{ + TransactionId xid; + int res; + + /* xmin */ + xid = HeapTupleGetXmin(tup); + if (TransactionIdIsNormal(xid)) + { + res = heap_page_try_prepare_for_xid(NULL, InvalidBuffer, page, xid, + false, is_toast); + if (res == -1) + elog(ERROR, "could not fit xid into page"); + } + + /* xmax */ + xid = HeapTupleGetRawXmax(tup); + if (TransactionIdIsNormal(xid)) + { + res = heap_page_try_prepare_for_xid(NULL, InvalidBuffer, page, xid, + tup->t_data->t_infomask & HEAP_XMAX_IS_MULTI, + is_toast); + if (res == -1) + elog(ERROR, "could not fit xid into page"); + } +} + + /* * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the * tuple header fields and toasts the tuple if necessary. Returns a toasted @@ -2211,7 +2776,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * that in any case, the header fields are also set in the original tuple. 
*/ static HeapTuple -heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, +heap_prepare_insert(Relation relation, HeapTuple tup, CommandId cid, int options) { /* @@ -2228,12 +2793,12 @@ heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); tup->t_data->t_infomask |= HEAP_XMAX_INVALID; - HeapTupleHeaderSetXmin(tup->t_data, xid); + HeapTupleSetXmin(tup, InvalidTransactionId); if (options & HEAP_INSERT_FROZEN) HeapTupleHeaderSetXminFrozen(tup->t_data); HeapTupleHeaderSetCmin(tup->t_data, cid); - HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */ + HeapTupleSetXmax(tup, 0); /* for cleanliness */ tup->t_tableOid = RelationGetRelid(relation); /* @@ -2296,8 +2861,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL); slots[i]->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slots[i]->tts_tableOid; - heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid, - options); + heaptuples[i] = heap_prepare_insert(relation, tuple, cid, options); } /* @@ -2353,6 +2917,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN)) all_frozen_set = true; + heap_page_prepare_for_xid(relation, buffer, xid, false); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -2360,6 +2926,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, * RelationGetBufferForTuple has ensured that the first tuple fits. * Put that on the page, and then as many other tuples as fit. 
*/ + HeapTupleSetXmin(heaptuples[ndone], xid); RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); /* @@ -2376,6 +2943,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) break; + HeapTupleSetXmin(heaptup, xid); RelationPutHeapTuple(relation, buffer, heaptup, false); /* @@ -2511,6 +3079,17 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, bufflags |= REGBUF_KEEP_DATA; XLogBeginInsert(); + + if (info & XLOG_HEAP_INIT_PAGE) + { + char *base; + + base = IsToastRelation(relation) ? + (char *) &ToastPageGetSpecial(page)->pd_xid_base : + (char *) &HeapPageGetSpecial(page)->pd_xid_base; + XLogRegisterData(base, sizeof(TransactionId)); + } + XLogRegisterData((char *) xlrec, tupledata - scratch.data); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); @@ -2718,6 +3297,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); tp.t_len = ItemIdGetLength(lp); tp.t_self = *tid; + HeapTupleCopyBaseFromPage(buffer, &tp, page, IsToastRelation(relation)); l1: /* @@ -2748,7 +3328,7 @@ l1: uint16 infomask; /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetRawXmax(tp.t_data); + xwait = HeapTupleGetRawXmax(&tp); infomask = tp.t_data->t_infomask; /* @@ -2787,6 +3367,10 @@ l1: NULL); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + /* Copy possibly updated xid base after relocking */ + HeapTupleCopyBaseFromPage(buffer, &tp, page, + IsToastRelation(relation)); + /* * If xwait had just locked the tuple then some other xact * could update this tuple before we get to this point. 
Check @@ -2797,7 +3381,7 @@ l1: */ if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) || xmax_infomask_changed(tp.t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), + !TransactionIdEquals(HeapTupleGetRawXmax(&tp), xwait)) goto l1; } @@ -2824,6 +3408,10 @@ l1: XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + /* Copy possibly updated xid base after relocking */ + HeapTupleCopyBaseFromPage(buffer, &tp, page, + IsToastRelation(relation)); + /* * xwait is done, but if xwait had just locked the tuple then some * other xact could update this tuple before we get to this point. @@ -2834,7 +3422,7 @@ l1: */ if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) || xmax_infomask_changed(tp.t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), + !TransactionIdEquals(HeapTupleGetRawXmax(&tp), xwait)) goto l1; @@ -2848,7 +3436,7 @@ l1: */ if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tp.t_data)) + HeapTupleIsOnlyLocked(&tp)) result = TM_Ok; else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) result = TM_Updated; @@ -2873,9 +3461,9 @@ l1: Assert(result != TM_Updated || !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)); tmfd->ctid = tp.t_data->t_ctid; - tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data); + tmfd->xmax = HeapTupleGetUpdateXidAny(&tp); if (result == TM_SelfModified) - tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); + tmfd->cmax = HeapTupleGetCmax(&tp); else tmfd->cmax = InvalidCommandId; UnlockReleaseBuffer(buffer); @@ -2898,7 +3486,7 @@ l1: CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer)); /* replace cid with a combo CID if necessary */ - HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); + HeapTupleHeaderAdjustCmax(&tp, &cid, &iscombo); /* * Compute replica identity tuple before entering 
the critical section so @@ -2916,11 +3504,20 @@ l1: */ MultiXactIdSetOldestMember(); - compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data), + compute_new_xmax_infomask(HeapTupleGetRawXmax(&tp), tp.t_data->t_infomask, tp.t_data->t_infomask2, xid, LockTupleExclusive, true, &new_xmax, &new_infomask, &new_infomask2); +#ifdef USE_ASSERT_CHECKING + if (IsToastRelation(relation)) + Assert((new_infomask & HEAP_XMAX_IS_MULTI) == false); +#endif + + heap_page_prepare_for_xid(relation, buffer, new_xmax, + (new_infomask & HEAP_XMAX_IS_MULTI) ? true : false); + HeapTupleCopyBaseFromPage(buffer, &tp, page, IsToastRelation(relation)); + START_CRIT_SECTION(); /* @@ -2930,7 +3527,7 @@ l1: * the subsequent page pruning will be a no-op and the hint will be * cleared. */ - PageSetPrunable(page, xid); + PageSetPrunable(page, xid, IsToastRelation(relation)); if (PageIsAllVisible(page)) { @@ -2946,10 +3543,15 @@ l1: tp.t_data->t_infomask |= new_infomask; tp.t_data->t_infomask2 |= new_infomask2; HeapTupleHeaderClearHotUpdated(tp.t_data); - HeapTupleHeaderSetXmax(tp.t_data, new_xmax); + HeapTupleSetXmax(&tp, new_xmax); + if (IsToastRelation(relation)) + ToastTupleHeaderSetXmax(page, &tp); + else + HeapTupleHeaderSetXmax(page, &tp); HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); /* Make sure there is no forward chain link in t_ctid */ tp.t_data->t_ctid = tp.t_self; + HeapTupleCopyBaseFromPage(buffer, &tp, page, IsToastRelation(relation)); /* Signal that this is actually a move into another partition */ if (changingPart) @@ -2985,6 +3587,8 @@ l1: tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; + if (IsToastRelation(relation)) + xlrec.flags |= XLH_DELETE_PAGE_ON_TOAST_RELATION; if (old_key_tuple != NULL) { @@ -3140,7 +3744,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, HeapTuple heaptup; HeapTuple old_key_tuple = NULL; bool old_key_copied = false; - Page page; + Page page, + newpage; BlockNumber 
block; MultiXactStatus mxact_status; Buffer buffer, @@ -3233,6 +3838,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp); oldtup.t_len = ItemIdGetLength(lp); oldtup.t_self = *otid; + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, IsToastRelation(relation)); /* the new tuple is ready, except for this: */ newtup->t_tableOid = RelationGetRelid(relation); @@ -3326,7 +3932,7 @@ l2: */ /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data); + xwait = HeapTupleGetRawXmax(&oldtup); infomask = oldtup.t_data->t_infomask; /* @@ -3377,6 +3983,8 @@ l2: checked_lockers = true; locker_remains = remain != 0; LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, + IsToastRelation(relation)); /* * If xwait had just locked the tuple then some other xact @@ -3385,7 +3993,7 @@ l2: */ if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data), + !TransactionIdEquals(HeapTupleGetRawXmax(&oldtup), xwait)) goto l2; } @@ -3411,7 +4019,7 @@ l2: * subxact aborts. */ if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask)) - update_xact = HeapTupleGetUpdateXid(oldtup.t_data); + update_xact = HeapTupleGetUpdateXid(&oldtup); else update_xact = InvalidTransactionId; @@ -3459,6 +4067,9 @@ l2: checked_lockers = true; LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, + IsToastRelation(relation)); + /* * xwait is done, but if xwait had just locked the tuple then some * other xact could update this tuple before we get to this point. 
@@ -3466,7 +4077,7 @@ l2: */ if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) || !TransactionIdEquals(xwait, - HeapTupleHeaderGetRawXmax(oldtup.t_data))) + HeapTupleGetRawXmax(&oldtup))) goto l2; /* Otherwise check if it committed or aborted */ @@ -3503,9 +4114,9 @@ l2: Assert(result != TM_Updated || !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)); tmfd->ctid = oldtup.t_data->t_ctid; - tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); + tmfd->xmax = HeapTupleGetUpdateXidAny(&oldtup); if (result == TM_SelfModified) - tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); + tmfd->cmax = HeapTupleGetCmax(&oldtup); else tmfd->cmax = InvalidCommandId; UnlockReleaseBuffer(buffer); @@ -3535,6 +4146,8 @@ l2: LockBuffer(buffer, BUFFER_LOCK_UNLOCK); visibilitymap_pin(relation, block, &vmbuffer); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, + IsToastRelation(relation)); goto l2; } @@ -3544,7 +4157,7 @@ l2: * If the tuple we're updating is locked, we need to preserve the locking * info in the old tuple's Xmax. Prepare a new Xmax value for this. 
*/ - compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), + compute_new_xmax_infomask(HeapTupleGetRawXmax(&oldtup), oldtup.t_data->t_infomask, oldtup.t_data->t_infomask2, xid, *lockmode, true, @@ -3563,7 +4176,7 @@ l2: (checked_lockers && !locker_remains)) xmax_new_tuple = InvalidTransactionId; else - xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data); + xmax_new_tuple = HeapTupleGetRawXmax(&oldtup); if (!TransactionIdIsValid(xmax_new_tuple)) { @@ -3596,17 +4209,15 @@ l2: */ newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); - HeapTupleHeaderSetXmin(newtup->t_data, xid); HeapTupleHeaderSetCmin(newtup->t_data, cid); newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple; newtup->t_data->t_infomask2 |= infomask2_new_tuple; - HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple); /* * Replace cid with a combo CID if necessary. Note that we already put * the plain cid into the new tuple. */ - HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo); + HeapTupleHeaderAdjustCmax(&oldtup, &cid, &iscombo); /* * If the toaster needs to be activated, OR if the new tuple will not fit @@ -3636,7 +4247,7 @@ l2: newtupsize = MAXALIGN(newtup->t_len); - if (need_toast || newtupsize > pagefree) + if (need_toast || newtupsize > pagefree || HeapPageIsDoubleXmax(page)) { TransactionId xmax_lock_old_tuple; uint16 infomask_lock_old_tuple, @@ -3661,7 +4272,7 @@ l2: * updating, because the potentially created multixact would otherwise * be wrong. 
*/ - compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), + compute_new_xmax_infomask(HeapTupleGetRawXmax(&oldtup), oldtup.t_data->t_infomask, oldtup.t_data->t_infomask2, xid, *lockmode, false, @@ -3670,6 +4281,16 @@ l2: Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple)); +#ifdef USE_ASSERT_CHECKING + if (IsToastRelation(relation)) + Assert((infomask_lock_old_tuple & HEAP_XMAX_IS_MULTI) == false); +#endif + + heap_page_prepare_for_xid(relation, buffer, xmax_lock_old_tuple, + (infomask_lock_old_tuple & HEAP_XMAX_IS_MULTI) ? true : false); + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, + IsToastRelation(relation)); + START_CRIT_SECTION(); /* Clear obsolete visibility flags ... */ @@ -3678,10 +4299,13 @@ l2: HeapTupleClearHotUpdated(&oldtup); /* ... and store info about transaction updating this tuple */ Assert(TransactionIdIsValid(xmax_lock_old_tuple)); - HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple); oldtup.t_data->t_infomask |= infomask_lock_old_tuple; oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple; + HeapTupleSetXmax(&oldtup, xmax_lock_old_tuple); + HeapTupleHeaderSetXmax(page, &oldtup); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, + IsToastRelation(relation)); /* temporarily make it look not-updated, but locked */ oldtup.t_data->t_ctid = oldtup.t_self; @@ -3763,7 +4387,11 @@ l2: */ for (;;) { - if (newtupsize > pagefree) + /* + * We can't fit new tuple to "double xmax" page, since it's + * impossible to set xmin there. + */ + if (newtupsize > pagefree || HeapPageIsDoubleXmax(page)) { /* It doesn't fit, must use RelationGetBufferForTuple. 
*/ newbuf = RelationGetBufferForTuple(relation, heaptup->t_len, buffer, 0, NULL, &vmbuffer_new, &vmbuffer, 0); /* We're all done. */ break; } } + + /* Copy possibly updated xid base to old tuple after relocking */ + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, + IsToastRelation(relation)); } else { @@ -3855,6 +4487,48 @@ l2: id_has_external, &old_key_copied); + newpage = BufferGetPage(newbuf); + + /* + * Prepare pages for the current xid, which is written to the new tuple's Xmax + * and the old page's pd_prune_xid. + */ + heap_page_prepare_for_xid(relation, buffer, xid, false); + if (newbuf != buffer) + heap_page_prepare_for_xid(relation, newbuf, xid, false); + +#ifdef USE_ASSERT_CHECKING + if (IsToastRelation(relation)) + { + Assert((infomask_old_tuple & HEAP_XMAX_IS_MULTI) == false); + Assert((heaptup->t_data->t_infomask & HEAP_XMAX_IS_MULTI) == false); + } +#endif + + /* Prepare pages for tuple's Xmax */ + heap_page_prepare_for_xid(relation, buffer, xmax_old_tuple, + (infomask_old_tuple & HEAP_XMAX_IS_MULTI) ? true : false); + heap_page_prepare_for_xid(relation, newbuf, xmax_new_tuple, + (heaptup->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ? true : false); + + /* Copy possibly updated Xid bases to both tuples. */ + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, + IsToastRelation(relation)); + + /* + * Set new tuple's Xmin/Xmax; old tuple's Xmin/Xmax were already shifted. + */ + HeapTupleSetXmin(heaptup, xid); + if (IsToastRelation(relation)) + ToastTupleHeaderSetXmin(newpage, heaptup); + else + HeapTupleHeaderSetXmin(newpage, heaptup); + HeapTupleSetXmax(heaptup, xmax_new_tuple); + if (IsToastRelation(relation)) + ToastTupleHeaderSetXmax(newpage, heaptup); + else + HeapTupleHeaderSetXmax(newpage, heaptup); + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -3870,7 +4544,9 @@ l2: * not to optimize for aborts. Note that heap_xlog_update must be kept in * sync if this decision changes. 
*/ - PageSetPrunable(page, xid); + + Assert(!IsToastRelation(relation)); + PageSetPrunable(page, xid, false); if (use_hot_update) { @@ -3897,10 +4573,12 @@ l2: oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; /* ... and store info about transaction updating this tuple */ Assert(TransactionIdIsValid(xmax_old_tuple)); - HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple); oldtup.t_data->t_infomask |= infomask_old_tuple; oldtup.t_data->t_infomask2 |= infomask2_old_tuple; + HeapTupleSetXmax(&oldtup, xmax_old_tuple); + HeapTupleHeaderSetXmax(page, &oldtup); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); + HeapTupleCopyBaseFromPage(buffer, &oldtup, page, IsToastRelation(relation)); /* record address of new tuple in t_ctid of old one */ oldtup.t_data->t_ctid = heaptup->t_self; @@ -3954,6 +4632,20 @@ l2: END_CRIT_SECTION(); + if (newtup != heaptup) + { + /* + * Set new tuple's Xmin/Xmax only after both xid base preparations. + * Old tuple's Xmin/Xmax were already shifted because old tuple is on + * the page. 
+ */ + HeapTupleCopyBase(newtup, heaptup); + HeapTupleSetXmin(newtup, xid); + HeapTupleHeaderSetXmin(newpage, newtup); + HeapTupleSetXmax(newtup, xmax_new_tuple); + HeapTupleHeaderSetXmax(newpage, newtup); + } + if (newbuf != buffer) LockBuffer(newbuf, BUFFER_LOCK_UNLOCK); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); @@ -4292,6 +4984,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); tuple->t_len = ItemIdGetLength(lp); tuple->t_tableOid = RelationGetRelid(relation); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, IsToastRelation(relation)); l3: result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); @@ -4318,7 +5011,7 @@ l3: ItemPointerData t_ctid; /* must copy state data before unlocking buffer */ - xwait = HeapTupleHeaderGetRawXmax(tuple->t_data); + xwait = HeapTupleGetRawXmax(tuple); infomask = tuple->t_data->t_infomask; infomask2 = tuple->t_data->t_infomask2; ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); @@ -4476,11 +5169,15 @@ l3: result = res; /* recovery code expects to have buffer lock held */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); goto failed; } } LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); /* * Make sure it's still an appropriate lock, else start over. @@ -4489,7 +5186,7 @@ l3: * now need to follow the update chain to lock the new * versions. */ - if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) && + if (!HeapTupleIsOnlyLocked(tuple) && ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) || !updated)) goto l3; @@ -4516,6 +5213,8 @@ l3: !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); /* * Make sure it's still an appropriate lock, else start over. @@ -4544,8 +5243,11 @@ l3: * meantime, start over. 
*/ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait)) goto l3; @@ -4556,10 +5258,12 @@ l3: else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) { LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); /* if the xmax changed in the meantime, start over */ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait)) goto l3; /* otherwise, we're good */ @@ -4584,8 +5288,11 @@ l3: { /* ... but if the xmax changed in the meantime, start over */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); + if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait)) goto l3; Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask)); @@ -4606,6 +5313,8 @@ l3: if (require_sleep && (result == TM_Updated || result == TM_Deleted)) { LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); goto failed; } else if (require_sleep) @@ -4631,6 +5340,8 @@ l3: result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); goto failed; } @@ -4657,6 +5368,8 @@ l3: result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + 
IsToastRelation(relation)); goto failed; } break; @@ -4697,6 +5410,8 @@ l3: result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); goto failed; } break; @@ -4723,11 +5438,15 @@ l3: result = res; /* recovery code expects to have buffer lock held */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); goto failed; } } LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + IsToastRelation(relation)); /* * xwait is done, but if xwait had just locked the tuple then some @@ -4735,7 +5454,7 @@ l3: * Check for xmax change, and start over if so. */ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || - !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), + !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait)) goto l3; @@ -4763,7 +5482,7 @@ l3: if (!require_sleep || (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tuple->t_data)) + HeapTupleIsOnlyLocked(tuple)) result = TM_Ok; else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) result = TM_Updated; @@ -4789,9 +5508,9 @@ failed: Assert(result != TM_Updated || !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)); tmfd->ctid = tuple->t_data->t_ctid; - tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + tmfd->xmax = HeapTupleGetUpdateXidAny(tuple); if (result == TM_SelfModified) - tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); + tmfd->cmax = HeapTupleGetCmax(tuple); else tmfd->cmax = InvalidCommandId; goto out_locked; @@ -4811,10 +5530,12 @@ failed: LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); visibilitymap_pin(relation, block, &vmbuffer); LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, + 
IsToastRelation(relation)); goto l3; } - xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); + xmax = HeapTupleGetRawXmax(tuple); old_infomask = tuple->t_data->t_infomask; /* @@ -4836,6 +5557,15 @@ failed: GetCurrentTransactionId(), mode, false, &xid, &new_infomask, &new_infomask2); +#ifdef USE_ASSERT_CHECKING + if (IsToastRelation(relation)) + Assert((new_infomask & HEAP_XMAX_IS_MULTI) == false); +#endif + + heap_page_prepare_for_xid(relation, *buffer, xid, + (new_infomask & HEAP_XMAX_IS_MULTI) ? true : false); + HeapTupleCopyBaseFromPage(*buffer, tuple, page, IsToastRelation(relation)); + START_CRIT_SECTION(); /* @@ -4854,7 +5584,8 @@ failed: tuple->t_data->t_infomask2 |= new_infomask2; if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) HeapTupleHeaderClearHotUpdated(tuple->t_data); - HeapTupleHeaderSetXmax(tuple->t_data, xid); + HeapTupleSetXmax(tuple, xid); + HeapTupleHeaderSetXmax(page, tuple); /* * Make sure there is no forward chain link in t_ctid. Note that in the @@ -5448,12 +6179,19 @@ l4: LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); } + /* + * Copy xid base after buffer relocking, it could have changed since + * heap_fetch(). + */ + HeapTupleCopyBaseFromPage(buf, &mytup, BufferGetPage(buf), + IsToastRelation(rel)); + /* * Check the tuple XMIN against prior XMAX, if any. If we reached the * end of the chain, we're done, so return success. */ if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), + !TransactionIdEquals(HeapTupleGetXmin(&mytup), priorXmax)) { result = TM_Ok; @@ -5465,7 +6203,7 @@ l4: * (sub)transaction, then we already locked the last live one in the * chain, thus we're done, so return success. 
*/ - if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) + if (TransactionIdDidAbort(HeapTupleGetXmin(&mytup))) { result = TM_Ok; goto out_locked; @@ -5473,7 +6211,7 @@ l4: old_infomask = mytup.t_data->t_infomask; old_infomask2 = mytup.t_data->t_infomask2; - xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + xmax = HeapTupleGetRawXmax(&mytup); /* * If this tuple version has been updated or locked by some concurrent @@ -5486,7 +6224,7 @@ l4: TransactionId rawxmax; bool needwait; - rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data); + rawxmax = HeapTupleGetRawXmax(&mytup); if (old_infomask & HEAP_XMAX_IS_MULTI) { int nmembers; @@ -5627,14 +6365,25 @@ l4: VISIBILITYMAP_ALL_FROZEN)) cleared_all_frozen = true; +#ifdef USE_ASSERT_CHECKING + if (IsToastRelation(rel)) + Assert((new_infomask & HEAP_XMAX_IS_MULTI) == false); +#endif + + heap_page_prepare_for_xid(rel, buf, new_xmax, + (new_infomask & HEAP_XMAX_IS_MULTI) ? true : false); + HeapTupleCopyBaseFromPage(buf, &mytup, BufferGetPage(buf), + IsToastRelation(rel)); + START_CRIT_SECTION(); /* ... 
and set them */ - HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; mytup.t_data->t_infomask |= new_infomask; mytup.t_data->t_infomask2 |= new_infomask2; + HeapTupleSetXmax(&mytup, new_xmax); + HeapTupleHeaderSetXmax(BufferGetPage(buf), &mytup); MarkBufferDirty(buf); @@ -5668,14 +6417,14 @@ next: if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) || ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || - HeapTupleHeaderIsOnlyLocked(mytup.t_data)) + HeapTupleIsOnlyLocked(&mytup)) { result = TM_Ok; goto out_locked; } /* tail recursion */ - priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); + priorXmax = HeapTupleGetUpdateXidAny(&mytup); ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); UnlockReleaseBuffer(buf); } @@ -5882,12 +6631,13 @@ heap_abort_speculative(Relation relation, ItemPointer tid) tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); tp.t_len = ItemIdGetLength(lp); tp.t_self = *tid; + HeapTupleCopyBaseFromPage(buffer, &tp, page, IsToastRelation(relation)); /* * Sanity check that the tuple really is a speculatively inserted tuple, * inserted by us. 
*/ - if (tp.t_data->t_choice.t_heap.t_xmin != xid) + if (HeapTupleGetRawXmin(&tp) != xid) elog(ERROR, "attempted to kill a tuple inserted by another transaction"); if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data))) elog(ERROR, "attempted to kill a non-speculative tuple"); @@ -5916,7 +6666,9 @@ heap_abort_speculative(Relation relation, ItemPointer tid) prune_xid = relation->rd_rel->relfrozenxid; else prune_xid = TransactionXmin; - PageSetPrunable(page, prune_xid); + Assert(TransactionIdIsValid(prune_xid)); + heap_page_prepare_for_xid(relation, buffer, prune_xid, false); + PageSetPrunable(page, prune_xid, IsToastRelation(relation)); /* store transaction information of xact deleting the tuple */ tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); @@ -5925,9 +6677,15 @@ heap_abort_speculative(Relation relation, ItemPointer tid) /* * Set the tuple header xmin to InvalidTransactionId. This makes the * tuple immediately invisible everyone. (In particular, to any - * transactions waiting on the speculative token, woken up later.) + * transactions waiting on the speculative token, woken up later.) Don't + * need to reload xid base from page because InvalidTransactionId doesn't + * require xid base to be valid. 
*/ - HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); + HeapTupleSetXmin(&tp, InvalidTransactionId); + if (IsToastRelation(relation)) + ToastTupleHeaderSetXmin(page, &tp); + else + HeapTupleHeaderSetXmin(page, &tp); /* Clear the speculative insertion token too */ tp.t_data->t_ctid = tp.t_self; @@ -5946,6 +6704,8 @@ heap_abort_speculative(Relation relation, ItemPointer tid) XLogRecPtr recptr; xlrec.flags = XLH_DELETE_IS_SUPER; + if (IsToastRelation(relation)) + xlrec.flags |= XLH_DELETE_PAGE_ON_TOAST_RELATION; xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); @@ -6275,7 +7035,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, * individual members might even show that we don't need to keep anything. */ nnewmembers = 0; - newmembers = palloc(sizeof(MultiXactMember) * nmembers); + newmembers = palloc0(sizeof(MultiXactMember) * nmembers); has_lockers = false; update_xid = InvalidTransactionId; update_committed = false; @@ -6475,7 +7235,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, * The *frz WAL record we output completely removes all old XIDs during REDO. */ bool -heap_prepare_freeze_tuple(HeapTupleHeader tuple, +heap_prepare_freeze_tuple(HeapTuple htup, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId cutoff_xid, TransactionId cutoff_multi, xl_heap_freeze_tuple *frz, bool *totally_frozen, @@ -6487,11 +7247,12 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, bool xmin_frozen; bool freeze_xmax; TransactionId xid; + HeapTupleHeader tuple = htup->t_data; frz->frzflags = 0; frz->t_infomask2 = tuple->t_infomask2; frz->t_infomask = tuple->t_infomask; - frz->xmax = HeapTupleHeaderGetRawXmax(tuple); + frz->xmax = HeapTupleGetRawXmax(htup); /* * Process xmin. 
xmin_frozen has two slightly different meanings: in the @@ -6503,7 +7264,7 @@ heap_prepare_freeze_tuple, * handling, since either way the tuple's xmin will be a permanent value * once we're done with it. */ - xid = HeapTupleHeaderGetXmin(tuple); + xid = HeapTupleGetXmin(htup); if (!TransactionIdIsNormal(xid)) xmin_frozen = true; else @@ -6545,7 +7306,7 @@ heap_prepare_freeze_tuple, * * Make sure to keep heap_tuple_would_freeze in sync with this. */ - xid = HeapTupleHeaderGetRawXmax(tuple); + xid = HeapTupleGetRawXmax(htup); if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { @@ -6645,6 +7406,15 @@ heap_prepare_freeze_tuple, Assert(!TransactionIdIsValid(newxmax)); } } + else if ((tuple->t_infomask & HEAP_XMAX_INVALID) && + TransactionIdIsNormal(xid)) + { + /* + * Reset xmax without reading clog. + * This prevents excess growth of xmax. + */ + freeze_xmax = true; + } else if (TransactionIdIsNormal(xid)) { if (TransactionIdPrecedes(xid, relfrozenxid)) @@ -6679,7 +7449,7 @@ heap_prepare_freeze_tuple, } } else if ((tuple->t_infomask & HEAP_XMAX_INVALID) || - !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple))) + !TransactionIdIsValid(HeapTupleGetRawXmax(htup))) { freeze_xmax = false; xmax_already_frozen = true; @@ -6775,18 +7545,35 @@ heap_prepare_freeze_tuple, * NB: All code in here must be safe to execute during crash recovery! 
*/ void -heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz) +heap_execute_freeze_tuple(HeapTuple htup, xl_heap_freeze_tuple *frz) { - HeapTupleHeaderSetXmax(tuple, frz->xmax); + HeapTupleHeader tuple = htup->t_data; + + tuple->t_infomask = frz->t_infomask; + tuple->t_infomask2 = frz->t_infomask2; + + HeapTupleSetXmax(htup, frz->xmax); if (frz->frzflags & XLH_FREEZE_XVAC) HeapTupleHeaderSetXvac(tuple, FrozenTransactionId); if (frz->frzflags & XLH_INVALID_XVAC) HeapTupleHeaderSetXvac(tuple, InvalidTransactionId); +} - tuple->t_infomask = frz->t_infomask; - tuple->t_infomask2 = frz->t_infomask2; +void +heap_execute_freeze_tuple_page(Page page, HeapTupleHeader htup, + xl_heap_freeze_tuple *frz, bool is_toast) +{ + HeapTupleData tuple; + + tuple.t_data = htup; + heap_execute_freeze_tuple(&tuple, frz); + + if (is_toast) + ToastTupleHeaderSetXmax(page, &tuple); + else + HeapTupleHeaderSetXmax(page, &tuple); } /* @@ -6796,7 +7583,7 @@ heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz) * Useful for callers like CLUSTER that perform their own WAL logging. */ bool -heap_freeze_tuple(HeapTupleHeader tuple, +heap_freeze_tuple(HeapTuple tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId cutoff_xid, TransactionId cutoff_multi) { @@ -6963,10 +7750,10 @@ MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask) * checking the hint bits. */ TransactionId -HeapTupleGetUpdateXid(HeapTupleHeader tuple) +HeapTupleGetUpdateXid(HeapTuple tuple) { - return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple), - tuple->t_infomask); + return MultiXactIdGetUpdateXid(HeapTupleGetRawXmax(tuple), + tuple->t_data->t_infomask); } /* @@ -7192,15 +7979,18 @@ ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, * will eventually require freezing (if tuple isn't removed by pruning first). 
*/ bool -heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) +heap_tuple_needs_eventual_freeze(HeapTuple htup) { TransactionId xid; + HeapTupleHeader tuple; + + tuple = htup->t_data; /* * If xmin is a normal transaction ID, this tuple is definitely not * frozen. */ - xid = HeapTupleHeaderGetXmin(tuple); + xid = HeapTupleGetXmin(htup); if (TransactionIdIsNormal(xid)) return true; @@ -7211,13 +8001,13 @@ heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) { MultiXactId multi; - multi = HeapTupleHeaderGetRawXmax(tuple); + multi = HeapTupleGetRawXmax(htup); if (MultiXactIdIsValid(multi)) return true; } else { - xid = HeapTupleHeaderGetRawXmax(tuple); + xid = HeapTupleGetRawXmax(htup); if (TransactionIdIsNormal(xid)) return true; } @@ -7245,7 +8035,7 @@ heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) * never freeze here, which makes tracking the oldest extant XID/MXID simple. */ bool -heap_tuple_would_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, +heap_tuple_would_freeze(HeapTuple htup, TransactionId cutoff_xid, MultiXactId cutoff_multi, TransactionId *relfrozenxid_out, MultiXactId *relminmxid_out) @@ -7253,9 +8043,10 @@ heap_tuple_would_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, TransactionId xid; MultiXactId multi; bool would_freeze = false; + HeapTupleHeader tuple = htup->t_data; /* First deal with xmin */ - xid = HeapTupleHeaderGetXmin(tuple); + xid = HeapTupleGetXmin(htup); if (TransactionIdIsNormal(xid)) { if (TransactionIdPrecedes(xid, *relfrozenxid_out)) @@ -7268,9 +8059,9 @@ heap_tuple_would_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, xid = InvalidTransactionId; multi = InvalidMultiXactId; if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - multi = HeapTupleHeaderGetRawXmax(tuple); + multi = HeapTupleGetRawXmax(htup); else - xid = HeapTupleHeaderGetRawXmax(tuple); + xid = HeapTupleGetRawXmax(htup); if (TransactionIdIsNormal(xid)) { @@ -7343,14 +8134,14 @@ heap_tuple_would_freeze(HeapTupleHeader tuple, TransactionId 
cutoff_xid, * with queries. */ void -HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, +HeapTupleHeaderAdvanceLatestRemovedXid(HeapTuple tuple, TransactionId *latestRemovedXid) { - TransactionId xmin = HeapTupleHeaderGetXmin(tuple); - TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + TransactionId xmin = HeapTupleGetXmin(tuple); + TransactionId xmax = HeapTupleGetUpdateXidAny(tuple); + TransactionId xvac = HeapTupleHeaderGetXvac(tuple->t_data); - if (tuple->t_infomask & HEAP_MOVED) + if (tuple->t_data->t_infomask & HEAP_MOVED) { if (TransactionIdPrecedes(*latestRemovedXid, xvac)) *latestRemovedXid = xvac; @@ -7362,8 +8153,8 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, * * Look for a committed hint bit, or if no xmin bit is set, check clog. */ - if (HeapTupleHeaderXminCommitted(tuple) || - (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin))) + if (HeapTupleHeaderXminCommitted(tuple->t_data) || + (!HeapTupleHeaderXminInvalid(tuple->t_data) && TransactionIdDidCommit(xmin))) { if (xmax != xmin && TransactionIdFollows(xmax, *latestRemovedXid)) @@ -7713,7 +8504,7 @@ heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) for (;;) { ItemId lp; - HeapTupleHeader htup; + HeapTupleData htup; /* Sanity check (pure paranoia) */ if (offnum < FirstOffsetNumber) @@ -7750,16 +8541,18 @@ heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) if (!ItemIdIsNormal(lp)) break; - htup = (HeapTupleHeader) PageGetItem(page, lp); + htup.t_data = (HeapTupleHeader) PageGetItem(page, lp); + htup.t_len = ItemIdGetLength(lp); + HeapTupleCopyBaseFromPage(buf, &htup, page, IsToastRelation(rel)); /* * Check the tuple XMIN against prior XMAX, if any */ if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + !TransactionIdEquals(HeapTupleGetXmin(&htup), priorXmax)) break; - HeapTupleHeaderAdvanceLatestRemovedXid(htup, 
&latestRemovedXid); + HeapTupleHeaderAdvanceLatestRemovedXid(&htup, &latestRemovedXid); /* * If the tuple is not HOT-updated, then we are at the end of this @@ -7767,13 +8560,13 @@ heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) * chain (they get their own index entries) -- just move on to * next htid from index AM caller. */ - if (!HeapTupleHeaderIsHotUpdated(htup)) + if (!HeapTupleHeaderIsHotUpdated(htup.t_data)) break; /* Advance to next HOT chain member */ - Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno); - offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); + Assert(ItemPointerGetBlockNumber(&htup.t_data->t_ctid) == blkno); + offnum = ItemPointerGetOffsetNumber(&htup.t_data->t_ctid); + priorXmax = HeapTupleGetUpdateXidAny(&htup); } /* Enable further/final shrinking of deltids for caller */ @@ -8179,6 +8972,9 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, xlrec.cutoff_xid = cutoff_xid; xlrec.ntuples = ntuples; + xlrec.flags = 0; + if (IsToastRelation(reln)) + xlrec.flags |= XLH_FREEZE_PAGE_ON_TOAST_RELATION; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage); @@ -8353,13 +9149,13 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare WAL data for the old page */ xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self); - xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); + xlrec.old_xmax = HeapTupleGetRawXmax(oldtup); xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, oldtup->t_data->t_infomask2); /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); - xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + xlrec.new_xmax = HeapTupleGetRawXmax(newtup); bufflags = REGBUF_STANDARD; if (init) @@ -8371,6 +9167,10 @@ log_heap_update(Relation reln, Buffer oldbuf, if (oldbuf != newbuf) XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); + if (info & 
XLOG_HEAP_INIT_PAGE) + XLogRegisterData((char *) &HeapPageGetSpecial(page)->pd_xid_base, + sizeof(TransactionId)); + XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate); /* @@ -8483,8 +9283,8 @@ log_heap_new_cid(Relation relation, HeapTuple tup) { Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID)); Assert(!HeapTupleHeaderXminInvalid(hdr)); - xlrec.cmin = HeapTupleHeaderGetCmin(hdr); - xlrec.cmax = HeapTupleHeaderGetCmax(hdr); + xlrec.cmin = HeapTupleGetCmin(tup); + xlrec.cmax = HeapTupleGetCmax(tup); xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr); } /* No combo CID, so only cmin or cmax can be set by this TX */ @@ -8686,7 +9486,9 @@ heap_xlog_prune(XLogReaderState *record) heap_page_prune_execute(buffer, redirected, nredirected, nowdead, ndead, - nowunused, nunused); + nowunused, nunused, + xlrec->flags & XLH_PRUNE_REPAIR_FRAGMENTATION, + xlrec->flags & XLH_PRUNE_ON_TOAST_RELATION); /* * Note: we don't worry about updating the page's prunability hints. @@ -8978,7 +9780,8 @@ heap_xlog_freeze_page(XLogReaderState *record) lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */ tuple = (HeapTupleHeader) PageGetItem(page, lp); - heap_execute_freeze_tuple(tuple, xlrec_tp); + heap_execute_freeze_tuple_page(page, tuple, xlrec_tp, + xlrec->flags & XLH_FREEZE_PAGE_ON_TOAST_RELATION); } PageSetLSN(page, lsn); @@ -9049,6 +9852,8 @@ heap_xlog_delete(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { + HeapTupleData tuple; + page = BufferGetPage(buffer); if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) @@ -9064,14 +9869,29 @@ heap_xlog_delete(XLogReaderState *record) HeapTupleHeaderClearHotUpdated(htup); fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2); + tuple.t_data = htup; + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) - HeapTupleHeaderSetXmax(htup, xlrec->xmax); + { + HeapTupleSetXmax(&tuple, xlrec->xmax); + if (xlrec->flags & XLH_DELETE_PAGE_ON_TOAST_RELATION) + 
ToastTupleHeaderSetXmax(page, &tuple); + else + HeapTupleHeaderSetXmax(page, &tuple); + } else - HeapTupleHeaderSetXmin(htup, InvalidTransactionId); + { + HeapTupleSetXmin(&tuple, InvalidTransactionId); + if (xlrec->flags & XLH_DELETE_PAGE_ON_TOAST_RELATION) + ToastTupleHeaderSetXmin(page, &tuple); + else + HeapTupleHeaderSetXmin(page, &tuple); + } HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Mark the page as a candidate for pruning */ - PageSetPrunable(page, XLogRecGetXid(record)); + PageSetPrunable(page, XLogRecGetXid(record), + xlrec->flags & XLH_DELETE_PAGE_ON_TOAST_RELATION); if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); @@ -9092,7 +9912,7 @@ static void heap_xlog_insert(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; - xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record); + xl_heap_insert *xlrec; Buffer buffer; Page page; union @@ -9108,6 +9928,17 @@ heap_xlog_insert(XLogReaderState *record) BlockNumber blkno; ItemPointerData target_tid; XLogRedoAction action; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; + TransactionId pd_xid_base = InvalidTransactionId; + Pointer rec_data = (Pointer) XLogRecGetData(record); + + if (isinit) + { + pd_xid_base = *((TransactionId *) rec_data); + rec_data += sizeof(TransactionId); + } + + xlrec = (xl_heap_insert *) rec_data; XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); @@ -9132,11 +9963,24 @@ heap_xlog_insert(XLogReaderState *record) * If we inserted the first and only tuple on the page, re-initialize the * page from scratch. 
*/ - if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + if (isinit) { buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); - PageInit(page, BufferGetPageSize(buffer), 0); + + if (xlrec->flags & XLH_INSERT_ON_TOAST_RELATION) + { + PageInit(page, BufferGetPageSize(buffer), + sizeof(ToastPageSpecialData)); + ToastPageGetSpecial(page)->pd_xid_base = pd_xid_base; + } + else + { + PageInit(page, BufferGetPageSize(buffer), + sizeof(HeapPageSpecialData)); + HeapPageGetSpecial(page)->pd_xid_base = pd_xid_base; + } + action = BLK_NEEDS_REDO; } else @@ -9145,6 +9989,7 @@ heap_xlog_insert(XLogReaderState *record) { Size datalen; char *data; + HeapTupleData tuple; page = BufferGetPage(buffer); @@ -9168,7 +10013,12 @@ heap_xlog_insert(XLogReaderState *record) htup->t_infomask2 = xlhdr.t_infomask2; htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; - HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + tuple.t_data = htup; + HeapTupleSetXmin(&tuple, XLogRecGetXid(record)); + if (xlrec->flags & XLH_INSERT_ON_TOAST_RELATION) + ToastTupleHeaderSetXmin(page, &tuple); + else + HeapTupleHeaderSetXmin(page, &tuple); HeapTupleHeaderSetCmin(htup, FirstCommandId); htup->t_ctid = target_tid; @@ -9189,6 +10039,7 @@ heap_xlog_insert(XLogReaderState *record) MarkBufferDirty(buffer); } + if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); @@ -9228,12 +10079,19 @@ heap_xlog_multi_insert(XLogReaderState *record) int i; bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; XLogRedoAction action; + TransactionId pd_xid_base = InvalidTransactionId; + Pointer rec_data = (Pointer) XLogRecGetData(record); /* * Insertion doesn't overwrite MVCC data, so no conflict processing is * required. 
*/ - xlrec = (xl_heap_multi_insert *) XLogRecGetData(record); + if (isinit) + { + pd_xid_base = *((TransactionId *) rec_data); + rec_data += sizeof(TransactionId); + } + xlrec = (xl_heap_multi_insert *) rec_data; XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); @@ -9260,7 +10118,18 @@ heap_xlog_multi_insert(XLogReaderState *record) { buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); - PageInit(page, BufferGetPageSize(buffer), 0); + + if (xlrec->flags & XLH_INSERT_ON_TOAST_RELATION) + { + PageInit(page, BufferGetPageSize(buffer), sizeof(ToastPageSpecialData)); + ToastPageGetSpecial(page)->pd_xid_base = pd_xid_base; + } + else + { + PageInit(page, BufferGetPageSize(buffer), sizeof(HeapPageSpecialData)); + HeapPageGetSpecial(page)->pd_xid_base = pd_xid_base; + } + action = BLK_NEEDS_REDO; } else @@ -9281,6 +10150,7 @@ heap_xlog_multi_insert(XLogReaderState *record) { OffsetNumber offnum; xl_multi_insert_tuple *xlhdr; + HeapTupleData tuple; /* * If we're reinitializing the page, the tuples are stored in @@ -9311,7 +10181,9 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask2 = xlhdr->t_infomask2; htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; - HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + tuple.t_data = htup; + HeapTupleSetXmin(&tuple, XLogRecGetXid(record)); + HeapTupleHeaderSetXmin(page, &tuple); HeapTupleHeaderSetCmin(htup, FirstCommandId); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -9359,8 +10231,8 @@ static void heap_xlog_update(XLogReaderState *record, bool hot_update) { XLogRecPtr lsn = record->EndRecPtr; - xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); RelFileLocator rlocator; + xl_heap_update *xlrec; BlockNumber oldblk; BlockNumber newblk; ItemPointerData newtid; @@ -9384,6 +10256,17 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) Size freespace = 0; XLogRedoAction oldaction; XLogRedoAction 
newaction; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; + TransactionId pd_xid_base = InvalidTransactionId; + Pointer rec_data = (Pointer) XLogRecGetData(record); + + if (isinit) + { + pd_xid_base = *((TransactionId *) rec_data); + rec_data += sizeof(TransactionId); + } + + xlrec = (xl_heap_update *) rec_data; /* initialize to keep the compiler quiet */ oldtup.t_data = NULL; @@ -9430,6 +10313,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) &obuffer); if (oldaction == BLK_NEEDS_REDO) { + HeapTupleData tuple; + page = BufferGetPage(obuffer); offnum = xlrec->old_offnum; if (PageGetMaxOffsetNumber(page) >= offnum) @@ -9442,6 +10327,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) oldtup.t_data = htup; oldtup.t_len = ItemIdGetLength(lp); + /* Toast tuples are never updated. */ + HeapTupleCopyBaseFromPage(obuffer, &oldtup, page, false); htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; @@ -9451,13 +10338,16 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) HeapTupleHeaderClearHotUpdated(htup); fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); - HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); + tuple.t_data = htup; + HeapTupleSetXmax(&tuple, xlrec->old_xmax); + HeapTupleHeaderSetXmax(page, &tuple); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; /* Mark the page as a candidate for pruning */ - PageSetPrunable(page, XLogRecGetXid(record)); + /* Toast tuples are never updated. 
*/ + PageSetPrunable(page, XLogRecGetXid(record), false); if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) PageClearAllVisible(page); @@ -9474,11 +10364,15 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) nbuffer = obuffer; newaction = oldaction; } - else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + else if (isinit) { nbuffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(nbuffer); - PageInit(page, BufferGetPageSize(nbuffer), 0); + + /* Toast tuples are never updated. */ + PageInit(page, BufferGetPageSize(nbuffer), sizeof(HeapPageSpecialData)); + + HeapPageGetSpecial(page)->pd_xid_base = pd_xid_base; newaction = BLK_NEEDS_REDO; } else @@ -9506,6 +10400,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) char *recdata_end; Size datalen; Size tuplen; + HeapTupleData tuple; recdata = XLogRecGetBlockData(record, 0, &datalen); recdata_end = recdata + datalen; @@ -9584,9 +10479,12 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; - HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + tuple.t_data = htup; + HeapTupleSetXmin(&tuple, XLogRecGetXid(record)); + HeapTupleHeaderSetXmin(page, &tuple); HeapTupleHeaderSetCmin(htup, FirstCommandId); - HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + HeapTupleSetXmax(&tuple, xlrec->new_xmax); + HeapTupleHeaderSetXmax(page, &tuple); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9697,6 +10595,8 @@ heap_xlog_lock(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { + HeapTupleData tuple; + page = (Page) BufferGetPage(buffer); offnum = xlrec->offnum; @@ -9725,7 +10625,10 @@ heap_xlog_lock(XLogReaderState *record) BufferGetBlockNumber(buffer), offnum); } - HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); + + tuple.t_data = htup; + HeapTupleSetXmax(&tuple, xlrec->locking_xid); + HeapTupleHeaderSetXmax(page, &tuple); 
HeapTupleHeaderSetCmax(htup, FirstCommandId, false); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -9770,6 +10673,8 @@ heap_xlog_lock_updated(XLogReaderState *record) if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { + HeapTupleData tuple; + page = BufferGetPage(buffer); offnum = xlrec->offnum; @@ -9785,7 +10690,9 @@ heap_xlog_lock_updated(XLogReaderState *record) htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2); - HeapTupleHeaderSetXmax(htup, xlrec->xmax); + tuple.t_data = htup; + HeapTupleSetXmax(&tuple, xlrec->xmax); + HeapTupleHeaderSetXmax(page, &tuple); PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -9835,6 +10742,33 @@ heap_xlog_inplace(XLogReaderState *record) UnlockReleaseBuffer(buffer); } +static void +heap_xlog_base_shift(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_heap_base_shift *xlrec = (xl_heap_base_shift *) XLogRecGetData(record); + Buffer buffer; + Page page; + BlockNumber blkno; + RelFileLocator target_node; + + XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + heap_page_shift_base(NULL, InvalidBuffer, page, xlrec->multi, + xlrec->delta, + xlrec->flags & XLH_BASE_SHIFT_ON_TOAST_RELATION); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + + void heap_redo(XLogReaderState *record) { @@ -9921,6 +10855,21 @@ heap2_redo(XLogReaderState *record) } } +void +heap3_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP3_BASE_SHIFT: + heap_xlog_base_shift(record); + break; + default: + elog(PANIC, "heap3_redo: unknown op code %u", info); + } +} + /* * Mask a heap page before performing consistency checks on it. 
*/ @@ -9933,6 +10882,10 @@ heap_mask(char *pagedata, BlockNumber blkno) mask_page_lsn_and_checksum(page); mask_page_hint_bits(page); + + /* Ignore prune_xid (it's like a hint-bit) */ + HeapPageSetPruneXid(page, InvalidTransactionId, false); + mask_unused_space(page); for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) @@ -10048,14 +11001,14 @@ HeapCheckForSerializableConflictOut(bool visible, Relation relation, case HEAPTUPLE_LIVE: if (visible) return; - xid = HeapTupleHeaderGetXmin(tuple->t_data); + xid = HeapTupleGetXmin(tuple); break; case HEAPTUPLE_RECENTLY_DEAD: case HEAPTUPLE_DELETE_IN_PROGRESS: if (visible) - xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); + xid = HeapTupleGetUpdateXidAny(tuple); else - xid = HeapTupleHeaderGetXmin(tuple->t_data); + xid = HeapTupleGetXmin(tuple); if (TransactionIdPrecedes(xid, TransactionXmin)) { @@ -10065,7 +11018,7 @@ HeapCheckForSerializableConflictOut(bool visible, Relation relation, } break; case HEAPTUPLE_INSERT_IN_PROGRESS: - xid = HeapTupleHeaderGetXmin(tuple->t_data); + xid = HeapTupleGetXmin(tuple); break; case HEAPTUPLE_DEAD: Assert(!visible); @@ -10103,3 +11056,38 @@ HeapCheckForSerializableConflictOut(bool visible, Relation relation, CheckForSerializableConflictOut(relation, xid, snapshot); } + +/* + * Ensure that given xid fits base of given page. + */ +static bool +heap_page_prepare_for_xid(Relation relation, Buffer buffer, + TransactionId xid, bool multi) +{ + Page page = BufferGetPage(buffer); + int res; + + /* "Double xmax" page format doesn't require any preparation */ + if (HeapPageIsDoubleXmax(page)) + return false; + + if (!TransactionIdIsNormal(xid)) + return false; + + res = heap_page_try_prepare_for_xid(relation, buffer, page, xid, multi, + IsToastRelation(relation)); + if (res != -1) + return res == 1; + + /* Have to try freeing the page... 
*/ + freeze_single_heap_page(relation, buffer); + + res = heap_page_try_prepare_for_xid(relation, buffer, page, xid, multi, + IsToastRelation(relation)); + if (res != -1) + return res == 1; + + elog(ERROR, "could not fit xid into page"); + + return false; +} diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 335abea67c..ffbfae09dd 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -413,7 +413,7 @@ tuple_lock_retry: * changes in an existing tuple, except to invalid or * frozen, and neither of those can match priorXmax.) */ - if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), + if (!TransactionIdEquals(HeapTupleGetXmin(tuple), priorXmax)) { ReleaseBuffer(buffer); @@ -473,7 +473,7 @@ tuple_lock_retry: * variable instead of doing HeapTupleHeaderGetXmin again. */ if (TransactionIdIsCurrentTransactionId(priorXmax) && - HeapTupleHeaderGetCmin(tuple->t_data) >= cid) + HeapTupleGetCmin(tuple) >= cid) { tmfd->xmax = priorXmax; @@ -481,7 +481,7 @@ tuple_lock_retry: * Cmin is the problematic value, so store that. See * above. */ - tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data); + tmfd->cmax = HeapTupleGetCmin(tuple); ReleaseBuffer(buffer); return TM_SelfModified; } @@ -507,7 +507,7 @@ tuple_lock_retry: /* * As above, if xmin isn't what we're expecting, do nothing. 
*/ - if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data), + if (!TransactionIdEquals(HeapTupleGetXmin(tuple), priorXmax)) { ReleaseBuffer(buffer); @@ -538,7 +538,7 @@ tuple_lock_retry: /* updated, so look at the updated row */ *tid = tuple->t_data->t_ctid; /* updated row should have xmin matching this xmax */ - priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + priorXmax = HeapTupleGetUpdateXidAny(tuple); ReleaseBuffer(buffer); /* loop back to fetch next in chain */ } @@ -858,7 +858,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, * case we had better copy it. */ if (!is_system_catalog && - !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) + !TransactionIdIsCurrentTransactionId(HeapTupleGetXmin(tuple))) elog(WARNING, "concurrent insert in progress within table \"%s\"", RelationGetRelationName(OldHeap)); /* treat as live */ @@ -870,7 +870,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, * Similar situation to INSERT_IN_PROGRESS case. */ if (!is_system_catalog && - !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) + !TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXidAny(tuple))) elog(WARNING, "concurrent delete in progress within table \"%s\"", RelationGetRelationName(OldHeap)); /* treat as recently dead */ @@ -1055,6 +1055,8 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, targtuple->t_tableOid = RelationGetRelid(scan->rs_rd); targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); targtuple->t_len = ItemIdGetLength(itemid); + HeapTupleCopyBaseFromPage(hscan->rs_cbuf, targtuple, targpage, + IsToastRelation(scan->rs_rd)); switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin, hscan->rs_cbuf)) @@ -1090,7 +1092,7 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, * numbers we report to the cumulative stats system to make * this come out right.) 
*/ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetXmin(targtuple))) { sample_it = true; *liverows += 1; @@ -1121,7 +1123,7 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, * but not the post-image. We also get sane results if the * concurrent transaction never commits. */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXidAny(targtuple))) *deadrows += 1; else { @@ -1370,7 +1372,8 @@ heapam_index_build_range_scan(Relation heapRelation, Page page = BufferGetPage(hscan->rs_cbuf); LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - heap_get_root_tuples(page, root_offsets); + heap_get_root_tuples(heapRelation, hscan->rs_cbuf, page, + root_offsets); LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); root_blkno = hscan->rs_cblock; @@ -1463,7 +1466,7 @@ heapam_index_build_range_scan(Relation heapRelation, * before commit there. Give a warning if neither case * applies. 
*/ - xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); + xwait = HeapTupleGetXmin(heapTuple); if (!TransactionIdIsCurrentTransactionId(xwait)) { if (!is_system_catalog) @@ -1522,7 +1525,7 @@ heapam_index_build_range_scan(Relation heapRelation, break; } - xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); + xwait = HeapTupleGetUpdateXidAny(heapTuple); if (!TransactionIdIsCurrentTransactionId(xwait)) { if (!is_system_catalog) @@ -1667,7 +1670,8 @@ heapam_index_build_range_scan(Relation heapRelation, Page page = BufferGetPage(hscan->rs_cbuf); LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - heap_get_root_tuples(page, root_offsets); + heap_get_root_tuples(heapRelation, hscan->rs_cbuf, page, + root_offsets); LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); } @@ -1833,7 +1837,8 @@ heapam_index_validate_scan(Relation heapRelation, Page page = BufferGetPage(hscan->rs_cbuf); LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - heap_get_root_tuples(page, root_offsets); + heap_get_root_tuples(heapRelation, hscan->rs_cbuf, page, + root_offsets); LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); memset(in_index, 0, sizeof(in_index)); @@ -2200,13 +2205,15 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); loctup.t_len = ItemIdGetLength(lp); loctup.t_tableOid = scan->rs_rd->rd_id; + HeapTupleCopyBaseFromPage(hscan->rs_cbuf, &loctup, dp, + IsToastRelation(scan->rs_rd)); ItemPointerSet(&loctup.t_self, page, offnum); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); if (valid) { hscan->rs_vistuples[ntup++] = offnum; PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot, - HeapTupleHeaderGetXmin(loctup.t_data)); + HeapTupleGetXmin(&loctup)); } HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer, snapshot); @@ -2221,6 +2228,13 @@ heapam_scan_bitmap_next_block(TableScanDesc scan, return ntup > 0; } +static inline void +HeapTupleSetInvalid(HeapTuple tuple) +{ + tuple->t_xmin = 
InvalidTransactionId; + tuple->t_xmax = InvalidTransactionId; +} + static bool heapam_scan_bitmap_next_tuple(TableScanDesc scan, TBMIterateResult *tbmres, @@ -2245,6 +2259,7 @@ heapam_scan_bitmap_next_tuple(TableScanDesc scan, hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); hscan->rs_ctup.t_len = ItemIdGetLength(lp); hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; + HeapTupleSetInvalid(&hscan->rs_ctup); ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset); pgstat_count_heap_fetch(scan->rs_rd); @@ -2385,8 +2400,14 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate, tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple->t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + if (pagemode) + HeapTupleSetInvalid(tuple); + else + HeapTupleCopyBaseFromPage(hscan->rs_cbuf, tuple, page, + IsToastRelation(scan->rs_rd)); + + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); if (all_visible) visible = true; diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 6e33d1c881..370a8a67d8 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -217,7 +217,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) } } } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmin(htup))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; @@ -229,7 +229,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) { TransactionId xmax; - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); @@ -241,7 +241,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; } - if 
(!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -251,11 +251,11 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsInProgress(HeapTupleGetRawXmin(htup))) return false; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdDidCommit(HeapTupleGetRawXmin(htup))) SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + HeapTupleGetRawXmin(htup)); else { /* it must have aborted or crashed */ @@ -284,7 +284,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); @@ -299,17 +299,17 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return true; } - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; return false; } - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleGetRawXmax(htup))) return true; - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleGetRawXmax(htup))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -327,7 +327,7 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) } SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + 
HeapTupleGetRawXmax(htup)); return false; } @@ -416,7 +416,7 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, * is canceled by super-deleting the tuple. This also applies to * TOAST tuples created during speculative insertion. */ - else if (!TransactionIdIsValid(HeapTupleHeaderGetXmin(tuple))) + else if (!TransactionIdIsValid(HeapTupleGetXmin(htup))) return false; } @@ -506,9 +506,9 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, } } } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmin(htup))) { - if (HeapTupleHeaderGetCmin(tuple) >= curcid) + if (HeapTupleGetCmin(htup) >= curcid) return TM_Invisible; /* inserted after scan started */ if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ @@ -518,7 +518,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, { TransactionId xmax; - xmax = HeapTupleHeaderGetRawXmax(tuple); + xmax = HeapTupleGetRawXmax(htup); /* * Careful here: even though this tuple was created by our own @@ -549,7 +549,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, { TransactionId xmax; - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); @@ -557,21 +557,21 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, /* deleting subtransaction must have aborted */ if (!TransactionIdIsCurrentTransactionId(xmax)) { - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), + if (MultiXactIdIsRunning(HeapTupleGetRawXmax(htup), false)) return TM_BeingModified; return TM_Ok; } else { - if (HeapTupleHeaderGetCmax(tuple) >= curcid) + if (HeapTupleGetCmax(htup) >= curcid) return TM_SelfModified; /* updated after scan started */ else return TM_Invisible; /* updated before scan started */ } } - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if 
(!TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -579,16 +579,16 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, return TM_Ok; } - if (HeapTupleHeaderGetCmax(tuple) >= curcid) + if (HeapTupleGetCmax(htup) >= curcid) return TM_SelfModified; /* updated after scan started */ else return TM_Invisible; /* updated before scan started */ } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsInProgress(HeapTupleGetRawXmin(htup))) return TM_Invisible; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdDidCommit(HeapTupleGetRawXmin(htup))) SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + HeapTupleGetRawXmin(htup)); else { /* it must have aborted or crashed */ @@ -622,17 +622,17 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) { - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), true)) + if (MultiXactIdIsRunning(HeapTupleGetRawXmax(htup), true)) return TM_BeingModified; SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); return TM_Ok; } - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); if (!TransactionIdIsValid(xmax)) { - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + if (MultiXactIdIsRunning(HeapTupleGetRawXmax(htup), false)) return TM_BeingModified; } @@ -641,13 +641,13 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, if (TransactionIdIsCurrentTransactionId(xmax)) { - if (HeapTupleHeaderGetCmax(tuple) >= curcid) + if (HeapTupleGetCmax(htup) >= curcid) return TM_SelfModified; /* updated after scan started */ else return TM_Invisible; /* updated before scan started */ } - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + if 
(MultiXactIdIsRunning(HeapTupleGetRawXmax(htup), false)) return TM_BeingModified; if (TransactionIdDidCommit(xmax)) @@ -663,7 +663,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, * what about the other members? */ - if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + if (!MultiXactIdIsRunning(HeapTupleGetRawXmax(htup), false)) { /* * There's no member, even just a locker, alive anymore, so we can @@ -680,20 +680,20 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, } } - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return TM_BeingModified; - if (HeapTupleHeaderGetCmax(tuple) >= curcid) + if (HeapTupleGetCmax(htup) >= curcid) return TM_SelfModified; /* updated after scan started */ else return TM_Invisible; /* updated before scan started */ } - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleGetRawXmax(htup))) return TM_BeingModified; - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleGetRawXmax(htup))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -711,7 +711,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, } SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + HeapTupleGetRawXmax(htup)); if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) return TM_Updated; /* updated by other */ else @@ -794,7 +794,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, } } } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmin(htup))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; @@ -806,7 +806,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot 
snapshot, { TransactionId xmax; - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); @@ -818,7 +818,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return false; } - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -828,7 +828,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return false; } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsInProgress(HeapTupleGetRawXmin(htup))) { /* * Return the speculative token to caller. Caller can worry about @@ -844,13 +844,13 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, Assert(snapshot->speculativeToken != 0); } - snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple); + snapshot->xmin = HeapTupleGetRawXmin(htup); /* XXX shouldn't we fall through to look at xmax? 
*/ return true; /* in insertion by other */ } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdDidCommit(HeapTupleGetRawXmin(htup))) SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + HeapTupleGetRawXmin(htup)); else { /* it must have aborted or crashed */ @@ -879,7 +879,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); @@ -897,21 +897,21 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return true; } - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) return true; return false; } - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleGetRawXmax(htup))) { if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple); + snapshot->xmax = HeapTupleGetRawXmax(htup); return true; } - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleGetRawXmax(htup))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -929,7 +929,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, } SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + HeapTupleGetRawXmax(htup)); return false; /* updated by other */ } @@ -1008,9 +1008,9 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, } } } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmin(htup))) { - if (HeapTupleHeaderGetCmin(tuple) >= 
snapshot->curcid) + if (HeapTupleGetCmin(htup) >= snapshot->curcid) return false; /* inserted after scan started */ if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ @@ -1023,7 +1023,7 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, { TransactionId xmax; - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); @@ -1031,13 +1031,13 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, /* updating subtransaction must have aborted */ if (!TransactionIdIsCurrentTransactionId(xmax)) return true; - else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + else if (HeapTupleGetCmax(htup) >= snapshot->curcid) return true; /* updated after scan started */ else return false; /* updated before scan started */ } - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { /* deleting subtransaction must have aborted */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -1045,16 +1045,16 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, return true; } - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + if (HeapTupleGetCmax(htup) >= snapshot->curcid) return true; /* deleted after scan started */ else return false; /* deleted before scan started */ } - else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) + else if (XidInMVCCSnapshot(HeapTupleGetRawXmin(htup), snapshot)) return false; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdDidCommit(HeapTupleGetRawXmin(htup))) SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + HeapTupleGetRawXmin(htup)); else { /* it must have aborted or crashed */ @@ -1067,7 +1067,7 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, { /* xmin is committed, but maybe not according to our snapshot */ if 
(!HeapTupleHeaderXminFrozen(tuple) && - XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) + XidInMVCCSnapshot(HeapTupleGetRawXmin(htup), snapshot)) return false; /* treat as still in progress */ } @@ -1086,14 +1086,14 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, /* already checked above */ Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); if (TransactionIdIsCurrentTransactionId(xmax)) { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + if (HeapTupleGetCmax(htup) >= snapshot->curcid) return true; /* deleted after scan started */ else return false; /* deleted before scan started */ @@ -1108,18 +1108,18 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) { - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + if (HeapTupleGetCmax(htup) >= snapshot->curcid) return true; /* deleted after scan started */ else return false; /* deleted before scan started */ } - if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) + if (XidInMVCCSnapshot(HeapTupleGetRawXmax(htup), snapshot)) return true; - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdDidCommit(HeapTupleGetRawXmax(htup))) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, @@ -1129,12 +1129,12 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, /* xmax transaction committed */ SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + HeapTupleGetRawXmax(htup)); } else { /* xmax is committed, but maybe not according to our snapshot */ - if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), 
snapshot)) + if (XidInMVCCSnapshot(HeapTupleGetRawXmax(htup), snapshot)) return true; /* treat as still in progress */ } @@ -1249,21 +1249,21 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de return HEAPTUPLE_DEAD; } } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmin(htup))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return HEAPTUPLE_INSERT_IN_PROGRESS; /* only locked? run infomask-only check first, for performance */ if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) || - HeapTupleHeaderIsOnlyLocked(tuple)) + HeapTupleIsOnlyLocked(htup)) return HEAPTUPLE_INSERT_IN_PROGRESS; /* inserted and then deleted by same xact */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXidAny(htup))) return HEAPTUPLE_DELETE_IN_PROGRESS; /* deleting subtransaction must have aborted */ return HEAPTUPLE_INSERT_IN_PROGRESS; } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdIsInProgress(HeapTupleGetRawXmin(htup))) { /* * It'd be possible to discern between INSERT/DELETE in progress @@ -1275,9 +1275,9 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de */ return HEAPTUPLE_INSERT_IN_PROGRESS; } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + else if (TransactionIdDidCommit(HeapTupleGetRawXmin(htup))) SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); + HeapTupleGetRawXmin(htup)); else { /* @@ -1319,14 +1319,14 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de * possibly be running; otherwise have to check. 
*/ if (!HEAP_LOCKED_UPGRADED(tuple->t_infomask) && - MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), + MultiXactIdIsRunning(HeapTupleGetRawXmax(htup), true)) return HEAPTUPLE_LIVE; SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); } else { - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleGetRawXmax(htup))) return HEAPTUPLE_LIVE; SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -1344,7 +1344,7 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - TransactionId xmax = HeapTupleGetUpdateXid(tuple); + TransactionId xmax = HeapTupleGetUpdateXid(htup); /* already checked above */ Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); @@ -1367,7 +1367,7 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de *dead_after = xmax; return HEAPTUPLE_RECENTLY_DEAD; } - else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) + else if (!MultiXactIdIsRunning(HeapTupleGetRawXmax(htup), false)) { /* * Not in Progress, Not Committed, so either Aborted or crashed. @@ -1381,11 +1381,11 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) { - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + if (TransactionIdIsInProgress(HeapTupleGetRawXmax(htup))) return HEAPTUPLE_DELETE_IN_PROGRESS; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + else if (TransactionIdDidCommit(HeapTupleGetRawXmax(htup))) SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + HeapTupleGetRawXmax(htup)); else { /* @@ -1407,7 +1407,7 @@ HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *de * Deleter committed, allow caller to check if it was recent enough that * some open transactions could still see the tuple. 
*/ - *dead_after = HeapTupleHeaderGetRawXmax(tuple); + *dead_after = HeapTupleGetRawXmax(htup); return HEAPTUPLE_RECENTLY_DEAD; } @@ -1503,7 +1503,7 @@ HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) /* Deleter committed, so tuple is dead if the XID is old enough. */ return GlobalVisTestIsRemovableXid(vistest, - HeapTupleHeaderGetRawXmax(tuple)); + HeapTupleGetRawXmax(htup)); } /* @@ -1516,8 +1516,9 @@ HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) * at the top of this file. */ bool -HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) +HeapTupleIsOnlyLocked(HeapTuple htup) { + HeapTupleHeader tuple = htup->t_data; TransactionId xmax; /* if there's no valid Xmax, then there's obviously no update either */ @@ -1528,7 +1529,7 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) return true; /* invalid xmax means no update */ - if (!TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple))) + if (!TransactionIdIsValid(HeapTupleGetRawXmax(htup))) return true; /* @@ -1539,7 +1540,7 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) return false; /* ... but if it's a multi, then perhaps the updating Xid aborted. 
*/ - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); @@ -1587,8 +1588,8 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, Buffer buffer) { HeapTupleHeader tuple = htup->t_data; - TransactionId xmin = HeapTupleHeaderGetXmin(tuple); - TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple); + TransactionId xmin = HeapTupleGetXmin(htup); + TransactionId xmax = HeapTupleGetRawXmax(htup); Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); @@ -1688,7 +1689,7 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, */ else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - xmax = HeapTupleGetUpdateXid(tuple); + xmax = HeapTupleGetUpdateXid(htup); } /* check if it's one of our txids, toplevel is also in there */ diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c index 1575a81b01..252e57cc1d 100644 --- a/src/backend/access/heap/heaptoast.c +++ b/src/backend/access/heap/heaptoast.c @@ -307,6 +307,7 @@ heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, result_tuple->t_len = new_tuple_len; result_tuple->t_self = newtup->t_self; result_tuple->t_tableOid = newtup->t_tableOid; + HeapTupleCopyBase(result_tuple, newtup); new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE); result_tuple->t_data = new_data; @@ -395,6 +396,7 @@ toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc) */ new_tuple->t_self = tup->t_self; new_tuple->t_tableOid = tup->t_tableOid; + HeapTupleCopyBase(new_tuple, tup); new_tuple->t_data->t_choice = tup->t_data->t_choice; new_tuple->t_data->t_ctid = tup->t_data->t_ctid; @@ -467,6 +469,7 @@ toast_flatten_tuple_to_datum(HeapTupleHeader tup, ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; tmptup.t_data = tup; + HeapTupleSetZeroBase(&tmptup); /* * Break down the tuple into fields. 
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index b0ece66629..49a91b727b 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -19,6 +19,7 @@ #include "access/hio.h" #include "access/htup_details.h" #include "access/visibilitymap.h" +#include "catalog/catalog.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -59,6 +60,19 @@ RelationPutHeapTuple(Relation relation, /* Add the tuple to the page */ pageHeader = BufferGetPage(buffer); + HeapTupleSetXmin(tuple, tuple->t_xmin); + if (IsToastRelation(relation)) + { + ToastTupleHeaderSetXmin(pageHeader, tuple); + ToastTupleHeaderSetXmax(pageHeader, tuple); + } + else + { + HeapTupleHeaderSetXmin(pageHeader, tuple); + HeapTupleHeaderSetXmax(pageHeader, tuple); + } + HeapTupleSetXmax(tuple, tuple->t_xmax); + offnum = PageAddItem(pageHeader, (Item) tuple->t_data, tuple->t_len, InvalidOffsetNumber, false, true); @@ -243,7 +257,7 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate) /* we'll need this info below */ blockNum = BufferGetBlockNumber(buffer); - freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData; + freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData - MAXALIGN(sizeof(HeapPageSpecialData)); UnlockReleaseBuffer(buffer); @@ -514,6 +528,9 @@ loop: /* * Now we can check to see if there's enough free space here. If so, * we're done. + * + * "Double xmax" page is not suitable for any new tuple, since xmin + * can't be set there. 
*/ page = BufferGetPage(buffer); @@ -525,12 +542,23 @@ loop: */ if (PageIsNew(page)) { - PageInit(page, BufferGetPageSize(buffer), 0); + if (IsToastRelation(relation)) + { + PageInit(page, BufferGetPageSize(buffer), sizeof(ToastPageSpecialData)); + ToastPageGetSpecial(page)->pd_xid_base = RecentXmin - FirstNormalTransactionId; + } + else + { + PageInit(page, BufferGetPageSize(buffer), sizeof(HeapPageSpecialData)); + HeapPageGetSpecial(page)->pd_xid_base = RecentXmin - FirstNormalTransactionId; + } + MarkBufferDirty(buffer); } pageFreeSpace = PageGetHeapFreeSpace(page); - if (targetFreeSpace <= pageFreeSpace) + if (targetFreeSpace <= pageFreeSpace && + !HeapPageIsDoubleXmax(page)) { /* use this page as future insert target, too */ RelationSetTargetBlock(relation, targetBlock); @@ -635,7 +663,17 @@ loop: BufferGetBlockNumber(buffer), RelationGetRelationName(relation)); - PageInit(page, BufferGetPageSize(buffer), 0); + if (IsToastRelation(relation)) + { + PageInit(page, BufferGetPageSize(buffer), sizeof(ToastPageSpecialData)); + ToastPageGetSpecial(page)->pd_xid_base = RecentXmin - FirstNormalTransactionId; + } + else + { + PageInit(page, BufferGetPageSize(buffer), sizeof(HeapPageSpecialData)); + HeapPageGetSpecial(page)->pd_xid_base = RecentXmin - FirstNormalTransactionId; + } + MarkBufferDirty(buffer); /* diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 9f43bbe25f..b1eff5a068 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -91,6 +91,17 @@ static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum); static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); static void page_verify_redirects(Page page); +static inline bool +XidFitsPage(Page page, TransactionId xid, bool is_toast) +{ + TransactionId base; + + base = is_toast ? 
ToastPageGetSpecial(page)->pd_xid_base : + HeapPageGetSpecial(page)->pd_xid_base; + + return xid >= base + FirstNormalTransactionId && + xid <= base + MaxShortTransactionId; +} /* * Optionally prune and repair fragmentation in the specified page. @@ -136,7 +147,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * determining the appropriate horizon is a waste if there's no prune_xid * (i.e. no updates/deletes left potentially dead tuples around). */ - prune_xid = ((PageHeader) page)->pd_prune_xid; + prune_xid = HeapPageGetPruneXidNoAssert(page, IsToastRelation(relation)); + if (!TransactionIdIsValid(prune_xid)) return; @@ -207,7 +219,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer) nnewlpdead; ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin, - limited_ts, &nnewlpdead, NULL); + limited_ts, &nnewlpdead, NULL, true); /* * Report the number of tuples reclaimed to pgstats. This is @@ -268,7 +280,8 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId old_snap_xmin, TimestampTz old_snap_ts, int *nnewlpdead, - OffsetNumber *off_loc) + OffsetNumber *off_loc, + bool repairFragmentation) { int ndeleted = 0; Page page = BufferGetPage(buffer); @@ -339,6 +352,8 @@ heap_page_prune(Relation relation, Buffer buffer, htup = (HeapTupleHeader) PageGetItem(page, itemid); tup.t_data = htup; tup.t_len = ItemIdGetLength(itemid); + HeapTupleCopyBaseFromPage(buffer, &tup, page, + IsToastRelation(relation)); ItemPointerSet(&(tup.t_self), blockno, offnum); /* @@ -393,13 +408,17 @@ heap_page_prune(Relation relation, Buffer buffer, heap_page_prune_execute(buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, - prstate.nowunused, prstate.nunused); + prstate.nowunused, prstate.nunused, + repairFragmentation, + IsToastRelation(relation)); /* * Update the page's pd_prune_xid field to either zero, or the lowest * XID of any soon-prunable tuple. 
*/ - ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; + if (XidFitsPage(page, prstate.new_prune_xid, IsToastRelation(relation))) + HeapPageSetPruneXid(page, prstate.new_prune_xid, + IsToastRelation(relation)); /* * Also clear the "page is full" flag, since there's no point in @@ -421,6 +440,13 @@ heap_page_prune(Relation relation, Buffer buffer, xlrec.latestRemovedXid = prstate.latestRemovedXid; xlrec.nredirected = prstate.nredirected; xlrec.ndead = prstate.ndead; + xlrec.flags = 0; + + if (IsToastRelation(relation)) + xlrec.flags |= XLH_PRUNE_ON_TOAST_RELATION; + + if (repairFragmentation) + xlrec.flags |= XLH_PRUNE_REPAIR_FRAGMENTATION; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapPrune); @@ -461,10 +487,12 @@ heap_page_prune(Relation relation, Buffer buffer, * point in repeating the prune/defrag process until something else * happens to the page. */ - if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || + bool is_toast = IsToastRelation(relation); + + if (HeapPageGetPruneXid(page, is_toast) != prstate.new_prune_xid || PageIsFull(page)) { - ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; + HeapPageSetPruneXid(page, prstate.new_prune_xid, is_toast); PageClearFull(page); MarkBufferDirtyHint(buffer, true); } @@ -601,6 +629,9 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) OffsetNumber chainitems[MaxHeapTuplesPerPage]; int nchain = 0, i; + HeapTupleData tup; + + tup.t_tableOid = RelationGetRelid(prstate->rel); rootlp = PageGetItemId(dp, rootoffnum); @@ -612,6 +643,12 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) Assert(prstate->htsv[rootoffnum] != -1); htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + tup.t_data = htup; + tup.t_len = ItemIdGetLength(rootlp); + ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum); + HeapTupleCopyBaseFromPage(buffer, &tup, dp, + IsToastRelation(prstate->rel)); + if 
(HeapTupleHeaderIsHeapOnly(htup)) { /* @@ -636,7 +673,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); - HeapTupleHeaderAdvanceLatestRemovedXid(htup, + HeapTupleHeaderAdvanceLatestRemovedXid(&tup, &prstate->latestRemovedXid); ndeleted++; } @@ -703,11 +740,17 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) Assert(prstate->htsv[offnum] != -1); htup = (HeapTupleHeader) PageGetItem(dp, lp); + tup.t_data = htup; + tup.t_len = ItemIdGetLength(lp); + HeapTupleCopyBaseFromPage(buffer, &tup, dp, + IsToastRelation(prstate->rel)); + ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum); + /* * Check the tuple XMIN against prior XMAX, if any */ if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + !TransactionIdEquals(HeapTupleGetXmin(&tup), priorXmax)) break; /* @@ -734,7 +777,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); + HeapTupleGetUpdateXidAny(&tup)); break; case HEAPTUPLE_DELETE_IN_PROGRESS: @@ -744,7 +787,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * that the page is reconsidered for pruning in future. 
*/ heap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); + HeapTupleGetUpdateXidAny(&tup)); break; case HEAPTUPLE_LIVE: @@ -773,7 +816,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) if (tupdead) { latestdead = offnum; - HeapTupleHeaderAdvanceLatestRemovedXid(htup, + HeapTupleHeaderAdvanceLatestRemovedXid(&tup, &prstate->latestRemovedXid); } else if (!recent_dead) @@ -795,7 +838,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); + priorXmax = HeapTupleGetUpdateXidAny(&tup); } /* @@ -912,7 +955,9 @@ void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, - OffsetNumber *nowunused, int nunused) + OffsetNumber *nowunused, int nunused, + bool repairFragmentation, + bool is_toast) { Page page = (Page) BufferGetPage(buffer); OffsetNumber *offnum; @@ -1036,7 +1081,8 @@ heap_page_prune_execute(Buffer buffer, * Finally, repair any fragmentation, and update the page's hint bit about * whether it has free pointers. */ - PageRepairFragmentation(page); + if (repairFragmentation) + PageRepairFragmentation(page, is_toast); /* * Now that the page has been modified, assert that redirect items still @@ -1108,7 +1154,8 @@ page_verify_redirects(Page page) * and reused by a completely unrelated tuple. 
*/ void -heap_get_root_tuples(Page page, OffsetNumber *root_offsets) +heap_get_root_tuples(Relation relation, Buffer buffer, Page page, + OffsetNumber *root_offsets) { OffsetNumber offnum, maxoff; @@ -1123,6 +1170,7 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) HeapTupleHeader htup; OffsetNumber nextoffnum; TransactionId priorXmax; + HeapTupleData tup; /* skip unused and dead items */ if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) @@ -1131,6 +1179,9 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) if (ItemIdIsNormal(lp)) { htup = (HeapTupleHeader) PageGetItem(page, lp); + tup.t_data = htup; + HeapTupleCopyBaseFromPage(buffer, &tup, page, + IsToastRelation(relation)); /* * Check if this tuple is part of a HOT-chain rooted at some other @@ -1152,7 +1203,7 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) /* Set up to scan the HOT-chain */ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); + priorXmax = HeapTupleGetUpdateXidAny(&tup); } else { @@ -1191,9 +1242,12 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) break; htup = (HeapTupleHeader) PageGetItem(page, lp); + tup.t_data = htup; + HeapTupleCopyBaseFromPage(buffer, &tup, page, + IsToastRelation(relation)); if (TransactionIdIsValid(priorXmax) && - !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) + !TransactionIdEquals(priorXmax, HeapTupleGetXmin(&tup))) break; /* Remember the root line pointer for this item */ @@ -1207,7 +1261,7 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - priorXmax = HeapTupleHeaderGetUpdateXid(htup); + priorXmax = HeapTupleGetUpdateXidAny(&tup); } } } diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index b01b39b008..3060a7fdf8 100644 --- a/src/backend/access/heap/rewriteheap.c +++ 
b/src/backend/access/heap/rewriteheap.c @@ -380,6 +380,7 @@ rewrite_heap_tuple(RewriteState state, &old_tuple->t_data->t_choice.t_heap, sizeof(HeapTupleFields)); + HeapTupleCopyBase(new_tuple, old_tuple); new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; new_tuple->t_data->t_infomask |= @@ -389,7 +390,7 @@ rewrite_heap_tuple(RewriteState state, * While we have our hands on the tuple, we may as well freeze any * eligible xmin or xmax, so that future VACUUM effort can be saved. */ - heap_freeze_tuple(new_tuple->t_data, + heap_freeze_tuple(new_tuple, state->rs_old_rel->rd_rel->relfrozenxid, state->rs_old_rel->rd_rel->relminmxid, state->rs_freeze_xid, @@ -405,7 +406,7 @@ rewrite_heap_tuple(RewriteState state, * If the tuple has been updated, check the old-to-new mapping hash table. */ if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || - HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && + HeapTupleIsOnlyLocked(old_tuple)) && !HeapTupleHeaderIndicatesMovedPartitions(old_tuple->t_data) && !(ItemPointerEquals(&(old_tuple->t_self), &(old_tuple->t_data->t_ctid)))) @@ -413,7 +414,7 @@ rewrite_heap_tuple(RewriteState state, OldToNewMapping mapping; memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); + hashkey.xmin = HeapTupleGetUpdateXidAny(old_tuple); hashkey.tid = old_tuple->t_data->t_ctid; mapping = (OldToNewMapping) @@ -486,7 +487,7 @@ rewrite_heap_tuple(RewriteState state, * RECENTLY_DEAD if and only if the xmin is not before OldestXmin. 
*/ if ((new_tuple->t_data->t_infomask & HEAP_UPDATED) && - !TransactionIdPrecedes(HeapTupleHeaderGetXmin(new_tuple->t_data), + !TransactionIdPrecedes(HeapTupleGetXmin(new_tuple), state->rs_oldest_xmin)) { /* @@ -495,7 +496,7 @@ rewrite_heap_tuple(RewriteState state, UnresolvedTup unresolved; memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + hashkey.xmin = HeapTupleGetXmin(new_tuple); hashkey.tid = old_tid; unresolved = hash_search(state->rs_unresolved_tups, &hashkey, @@ -583,7 +584,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple) bool found; memset(&hashkey, 0, sizeof(hashkey)); - hashkey.xmin = HeapTupleHeaderGetXmin(old_tuple->t_data); + hashkey.xmin = HeapTupleGetXmin(old_tuple); hashkey.tid = old_tuple->t_self; unresolved = hash_search(state->rs_unresolved_tups, &hashkey, @@ -619,6 +620,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) Size len; OffsetNumber newoff; HeapTuple heaptup; + TransactionId xmin; + bool immutable_tuple; /* * If the new tuple is too big for storage or contains already toasted @@ -653,9 +656,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup) len = MAXALIGN(heaptup->t_len); /* be conservative */ /* - * If we're gonna fail for oversize tuple, do it right away + * Due to the update to 64-bit XIDs, the maximum plain tuple size was decreased + * due to adding PageSpecial to a heap page. Pages with a tuple that became too + * large to fit should remain in Double Xmax format (read only). Inserting plain + * tuples with size over the new MaxHeapTupleSize is prohibited anyway, but + * vacuum full will transfer this page to a rebuilt relation unmodified. */ - if (len > MaxHeapTupleSize) + immutable_tuple = len <= MaxHeapTupleSize_32 && len > MaxHeapTupleSize; + + /* + * If we're gonna fail for oversize tuple, do it right away. But allow processing + * of immutable_tuple (see above). 
+ */ + if (len > MaxHeapTupleSize && !immutable_tuple) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("row is too big: size %zu, maximum size %zu", @@ -704,10 +717,42 @@ raw_heap_insert(RewriteState state, HeapTuple tup) if (!state->rs_buffer_valid) { /* Initialize a new empty page */ - PageInit(page, BLCKSZ, 0); + if (immutable_tuple) + /* Initialize DoubleXmax page */ + PageInit(page, BLCKSZ, 0); + else + { + Size special_size; + + special_size = IsToastRelation(state->rs_new_rel) ? + sizeof(ToastPageSpecialData) : + sizeof(HeapPageSpecialData); + PageInit(page, BLCKSZ, special_size); + } state->rs_buffer_valid = true; } + rewrite_page_prepare_for_xid(page, heaptup, + IsToastRelation(state->rs_new_rel)); + + /* + * Tuple with HEAP_XMIN_FROZEN in t_infomask should have xmin set + * to FrozenTransactionId to avoid these tuples be treated like normal. + */ + xmin = HeapTupleGetXmin(heaptup); + HeapTupleSetXmin(heaptup, xmin); + + /* + * Tuples on DoubleXmax page could not appear modified after they had been + * frozen by pg_upgrade. Just check this to be safe. 
+ */ + Assert(!immutable_tuple || xmin == FrozenTransactionId); + + if (!immutable_tuple) + HeapTupleHeaderSetXmin(page, heaptup); + + HeapTupleHeaderSetXmax(page, heaptup); + /* And now we can insert the tuple into the page */ newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, InvalidOffsetNumber, false, true); @@ -988,19 +1033,24 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid, */ if (!found) { - char path[MAXPGPATH]; - Oid dboid; + char path[MAXPGPATH]; + Oid dboid; + TransactionId current_xid; if (state->rs_old_rel->rd_rel->relisshared) dboid = InvalidOid; else dboid = MyDatabaseId; + current_xid = GetCurrentTransactionId(); snprintf(path, MAXPGPATH, "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, dboid, relid, LSN_FORMAT_ARGS(state->rs_begin_lsn), - xid, GetCurrentTransactionId()); + (uint32) (xid >> 32), + (uint32) xid, + (uint32) (current_xid >> 32), + (uint32) current_xid); dlist_init(&src->mappings); src->num_mappings = 0; @@ -1049,9 +1099,9 @@ logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid, if (!state->rs_logical_rewrite) return; - xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); + xmin = HeapTupleGetXmin(new_tuple); /* use *GetUpdateXid to correctly deal with multixacts */ - xmax = HeapTupleHeaderGetUpdateXid(new_tuple->t_data); + xmax = HeapTupleGetUpdateXidAny(new_tuple); /* * Log the mapping iff the tuple has been created recently. 
@@ -1115,14 +1165,19 @@ heap_xlog_logical_rewrite(XLogReaderState *r) xl_heap_rewrite_mapping *xlrec; uint32 len; char *data; + TransactionId xid; xlrec = (xl_heap_rewrite_mapping *) XLogRecGetData(r); + xid = XLogRecGetXid(r); snprintf(path, MAXPGPATH, "pg_logical/mappings/" LOGICAL_REWRITE_FORMAT, xlrec->mapped_db, xlrec->mapped_rel, LSN_FORMAT_ARGS(xlrec->start_lsn), - xlrec->mapped_xid, XLogRecGetXid(r)); + (uint32) (xlrec->mapped_xid >> 32), + (uint32) xlrec->mapped_xid, + (uint32) (xid >> 32), + (uint32) xid); fd = OpenTransientFile(path, O_CREAT | O_WRONLY | PG_BINARY); @@ -1217,10 +1272,12 @@ CheckPointLogicalRewriteHeap(void) Oid dboid; Oid relid; XLogRecPtr lsn; - TransactionId rewrite_xid; - TransactionId create_xid; - uint32 hi, - lo; + uint32 lsn_hi, + lsn_lo, + rewrite_xid_hi, + rewrite_xid_lo, + create_xid_hi, + create_xid_lo; PGFileType de_type; if (strcmp(mapping_de->d_name, ".") == 0 || @@ -1238,10 +1295,12 @@ CheckPointLogicalRewriteHeap(void) continue; if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, - &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6) + &dboid, &relid, &lsn_hi, &lsn_lo, + &rewrite_xid_hi, &rewrite_xid_lo, + &create_xid_hi, &create_xid_lo) != 8) elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); - lsn = ((uint64) hi) << 32 | lo; + lsn = ((uint64) lsn_hi) << 32 | lsn_lo; if (lsn < cutoff || cutoff == InvalidXLogRecPtr) { diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 864876adf7..5ff535d2a8 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -46,6 +46,7 @@ #include "access/xlog.h" #include "access/xloginsert.h" #include "catalog/index.h" +#include "catalog/catalog.h" #include "catalog/storage.h" #include "commands/dbcommands.h" #include "commands/progress.h" @@ -267,7 +268,6 @@ static bool lazy_vacuum_all_indexes(LVRelState *vacrel); static void lazy_vacuum_heap_rel(LVRelState *vacrel); static int 
lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, int index, Buffer *vmbuffer); -static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); static void lazy_cleanup_all_indexes(LVRelState *vacrel); static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat, @@ -528,7 +528,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * ensure that parallel VACUUM won't be attempted at all when relfrozenxid * is already dangerously old.) */ - lazy_check_wraparound_failsafe(vacrel); dead_items_alloc(vacrel, params->nworkers); /* @@ -645,7 +644,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, WalUsage walusage; StringInfoData buf; char *msgfmt; - int32 diff; + int64 diff; int64 PageHitOp = VacuumPageHit - StartPageHit, PageMissOp = VacuumPageMiss - StartPageMiss, PageDirtyOp = VacuumPageDirty - StartPageDirty; @@ -698,32 +697,35 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, orig_rel_pages == 0 ? 100.0 : 100.0 * vacrel->scanned_pages / orig_rel_pages); appendStringInfo(&buf, - _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable\n"), + _("tuples: %lld removed, %lld remain, %lld are dead but not yet removable, oldest xmin: %llu\n"), (long long) vacrel->tuples_deleted, (long long) vacrel->new_rel_tuples, - (long long) vacrel->recently_dead_tuples); + (long long) vacrel->recently_dead_tuples, + (unsigned long long) OldestXmin); if (vacrel->missed_dead_tuples > 0) appendStringInfo(&buf, _("tuples missed: %lld dead from %u pages not removed due to cleanup lock contention\n"), (long long) vacrel->missed_dead_tuples, vacrel->missed_dead_pages); - diff = (int32) (ReadNextTransactionId() - OldestXmin); + diff = (int64) (ReadNextTransactionId() - OldestXmin); appendStringInfo(&buf, - _("removable cutoff: %llu, which was %d XIDs old when operation ended\n"), - (unsigned long long) OldestXmin, diff); + _("removable cutoff: %llu, which was %lld XIDs old when operation ended\n"), + 
(unsigned long long) OldestXmin, (long long) diff); if (frozenxid_updated) { - diff = (int32) (vacrel->NewRelfrozenXid - vacrel->relfrozenxid); + diff = (int64) (vacrel->NewRelfrozenXid - vacrel->relfrozenxid); appendStringInfo(&buf, - _("new relfrozenxid: %llu, which is %d XIDs ahead of previous value\n"), - (unsigned long long) vacrel->NewRelfrozenXid, diff); + _("new relfrozenxid: %llu, which is %lld XIDs ahead of previous value\n"), + (unsigned long long) vacrel->NewRelfrozenXid, + (long long) diff); } if (minmulti_updated) { - diff = (int32) (vacrel->NewRelminMxid - vacrel->relminmxid); + diff = (int64) (vacrel->NewRelminMxid - vacrel->relminmxid); appendStringInfo(&buf, - _("new relminmxid: %llu, which is %d MXIDs ahead of previous value\n"), - (unsigned long long) vacrel->NewRelminMxid, diff); + _("new relminmxid: %llu, which is %lld MXIDs ahead of previous value\n"), + (unsigned long long) vacrel->NewRelminMxid, + (long long) diff); } appendStringInfo(&buf, _("frozen: %u pages from table (%.2f%% of total) had %lld tuples frozen\n"), vacrel->frozen_pages, @@ -932,7 +934,6 @@ lazy_scan_heap(LVRelState *vacrel) */ if (blkno - next_failsafe_block >= FAILSAFE_EVERY_PAGES) { - lazy_check_wraparound_failsafe(vacrel); next_failsafe_block = blkno; } @@ -1452,7 +1453,14 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0) { - freespace = BLCKSZ - SizeOfPageHeaderData; + Size special_size; + + special_size = IsToastRelation(vacrel->rel) ? 
+ sizeof(ToastPageSpecialData) : + sizeof(HeapPageSpecialData); + freespace = BufferGetPageSize(buf) + - SizeOfPageHeaderData + - special_size; RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); } @@ -1556,6 +1564,7 @@ lazy_scan_prune(LVRelState *vacrel, maxoff; ItemId itemid; HeapTupleData tuple; + HeapTupleHeader htup; HTSV_Result res; int tuples_deleted, tuples_frozen, @@ -1599,7 +1608,7 @@ retry: */ tuples_deleted = heap_page_prune(rel, buf, vacrel->vistest, InvalidTransactionId, 0, &nnewlpdead, - &vacrel->offnum); + &vacrel->offnum, true); /* * Now scan the page to collect LP_DEAD items and check for tuples @@ -1664,6 +1673,7 @@ retry: tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(rel); + HeapTupleCopyBaseFromPage(buf, &tuple, page, IsToastRelation(rel)); /* * DEAD tuples are almost always pruned into LP_DEAD line pointers by @@ -1727,7 +1737,7 @@ retry: * The inserter definitely committed. But is it old enough * that everyone sees it as committed? */ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); + xmin = HeapTupleGetXmin(&tuple); if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin)) { prunestate->all_visible = false; @@ -1783,7 +1793,7 @@ retry: * now. 
*/ prunestate->hastup = true; /* page makes rel truncation unsafe */ - if (heap_prepare_freeze_tuple(tuple.t_data, + if (heap_prepare_freeze_tuple(&tuple, vacrel->relfrozenxid, vacrel->relminmxid, vacrel->FreezeLimit, @@ -1840,12 +1850,10 @@ retry: /* execute collected freezes */ for (int i = 0; i < tuples_frozen; i++) { - HeapTupleHeader htup; - itemid = PageGetItemId(page, frozen[i].offset); htup = (HeapTupleHeader) PageGetItem(page, itemid); - - heap_execute_freeze_tuple(htup, &frozen[i]); + heap_execute_freeze_tuple_page(page, htup, &frozen[i], + IsToastRelation(vacrel->rel)); } /* Now WAL-log freezing if necessary */ @@ -1965,7 +1973,6 @@ lazy_scan_noprune(LVRelState *vacrel, live_tuples, recently_dead_tuples, missed_dead_tuples; - HeapTupleHeader tupleheader; TransactionId NewRelfrozenXid = vacrel->NewRelfrozenXid; MultiXactId NewRelminMxid = vacrel->NewRelminMxid; OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; @@ -2011,8 +2018,14 @@ lazy_scan_noprune(LVRelState *vacrel, } *hastup = true; /* page prevents rel truncation */ - tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); - if (heap_tuple_would_freeze(tupleheader, + ItemPointerSet(&(tuple.t_self), blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(vacrel->rel); + HeapTupleCopyBaseFromPage(buf, &tuple, page, + IsToastRelation(vacrel->rel)); + + if (heap_tuple_would_freeze(&tuple, vacrel->FreezeLimit, vacrel->MultiXactCutoff, &NewRelfrozenXid, &NewRelminMxid)) @@ -2045,11 +2058,6 @@ lazy_scan_noprune(LVRelState *vacrel, */ } - ItemPointerSet(&(tuple.t_self), blkno, offnum); - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(vacrel->rel); - switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf)) { case HEAPTUPLE_DELETE_IN_PROGRESS: @@ -2319,13 +2327,6 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) 
Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); - /* Precheck for XID wraparound emergencies */ - if (lazy_check_wraparound_failsafe(vacrel)) - { - /* Wraparound emergency -- don't even start an index scan */ - return false; - } - /* Report that we are now vacuuming indexes */ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_VACUUM_INDEX); @@ -2340,13 +2341,6 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) vacrel->indstats[idx] = lazy_vacuum_one_index(indrel, istat, vacrel->old_live_tuples, vacrel); - - if (lazy_check_wraparound_failsafe(vacrel)) - { - /* Wraparound emergency -- end current index scan */ - allindexes = false; - break; - } } } else @@ -2354,13 +2348,6 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) /* Outsource everything to parallel variant */ parallel_vacuum_bulkdel_all_indexes(vacrel->pvs, vacrel->old_live_tuples, vacrel->num_index_scans); - - /* - * Do a postcheck to consider applying wraparound failsafe now. Note - * that parallel VACUUM only gets the precheck and this postcheck. - */ - if (lazy_check_wraparound_failsafe(vacrel)) - allindexes = false; } /* @@ -2606,58 +2593,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, return index; } -/* - * Trigger the failsafe to avoid wraparound failure when vacrel table has a - * relfrozenxid and/or relminmxid that is dangerously far in the past. - * Triggering the failsafe makes the ongoing VACUUM bypass any further index - * vacuuming and heap vacuuming. Truncating the heap is also bypassed. - * - * Any remaining work (work that VACUUM cannot just bypass) is typically sped - * up when the failsafe triggers. VACUUM stops applying any cost-based delay - * that it started out with. - * - * Returns true when failsafe has been triggered. 
- */ -static bool -lazy_check_wraparound_failsafe(LVRelState *vacrel) -{ - Assert(TransactionIdIsNormal(vacrel->relfrozenxid)); - Assert(MultiXactIdIsValid(vacrel->relminmxid)); - - /* Don't warn more than once per VACUUM */ - if (vacrel->failsafe_active) - return true; - - if (unlikely(vacuum_xid_failsafe_check(vacrel->relfrozenxid, - vacrel->relminmxid))) - { - vacrel->failsafe_active = true; - - /* Disable index vacuuming, index cleanup, and heap rel truncation */ - vacrel->do_index_vacuuming = false; - vacrel->do_index_cleanup = false; - vacrel->do_rel_truncate = false; - - ereport(WARNING, - (errmsg("bypassing nonessential maintenance of table \"%s.%s.%s\" as a failsafe after %d index scans", - get_database_name(MyDatabaseId), - vacrel->relnamespace, - vacrel->relname, - vacrel->num_index_scans), - errdetail("The table's relfrozenxid or relminmxid is too far in the past."), - errhint("Consider increasing configuration parameter \"maintenance_work_mem\" or \"autovacuum_work_mem\".\n" - "You might also need to consider other ways for VACUUM to keep up with the allocation of transaction IDs."))); - - /* Stop applying cost limits from this point on */ - VacuumCostActive = false; - VacuumCostBalance = 0; - - return true; - } - - return false; -} - /* * lazy_cleanup_all_indexes() -- cleanup all indexes of relation. */ @@ -3285,7 +3220,8 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(vacrel->rel); - + HeapTupleCopyBaseFromPage(buf, &tuple, page, + IsToastRelation(vacrel->rel)); switch (HeapTupleSatisfiesVacuum(&tuple, vacrel->OldestXmin, buf)) { case HEAPTUPLE_LIVE: @@ -3304,7 +3240,7 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, * The inserter definitely committed. But is it old enough * that everyone sees it as committed? 
*/ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); + xmin = HeapTupleGetXmin(&tuple); if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin)) { all_visible = false; @@ -3318,7 +3254,7 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, /* Check whether this tuple is already frozen or not */ if (all_visible && *all_frozen && - heap_tuple_needs_eventual_freeze(tuple.t_data)) + heap_tuple_needs_eventual_freeze(&tuple)) *all_frozen = false; } break; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 8b96708b3e..6c4b1f0f50 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -546,6 +546,7 @@ _bt_getroot(Relation rel, int access) rootblkno = rootopaque->btpo_next; } + /* Note: can't check btpo_level on deleted pages */ if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), @@ -649,6 +650,7 @@ _bt_gettrueroot(Relation rel) rootblkno = rootopaque->btpo_next; } + /* Note: can't check btpo_level on deleted pages */ if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 241e26d338..c712ee645f 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -140,6 +140,7 @@ _bt_findsplitloc(Relation rel, olddataitemstoleft, perfectpenalty, leaffillfactor; + int maxTupleEnd PG_USED_FOR_ASSERTS_ONLY; FindSplitData state; FindSplitStrat strategy; ItemId itemid; @@ -153,6 +154,7 @@ _bt_findsplitloc(Relation rel, opaque = BTPageGetOpaque(origpage); maxoff = PageGetMaxOffsetNumber(origpage); + maxTupleEnd = ItemIdGetTupleEnd(PageGetItemId(origpage, P_HIKEY)); /* Total free space available on a btree page, after fixed overhead */ leftspace = rightspace = @@ -214,6 +216,18 
@@ _bt_findsplitloc(Relation rel, itemid = PageGetItemId(origpage, offnum); itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); +#ifdef USE_ASSERT_CHECKING + + /* + * Ending of rightmost tuple on a page can be shifted relative to left + * boundary of BTPageOpaqueData due to conversion from EE96, which + * used different BTPageOpaqueData layout. It is only checked in the + * assert below. + */ + if (maxTupleEnd < ItemIdGetTupleEnd(itemid)) + maxTupleEnd = ItemIdGetTupleEnd(itemid); +#endif + /* * When item offset number is not newitemoff, neither side of the * split can be newitem. Record a split after the previous data item @@ -248,7 +262,7 @@ _bt_findsplitloc(Relation rel, * (Though only when it's possible that newitem will end up alone on new * right page.) */ - Assert(olddataitemstoleft == olddataitemstotal); + Assert(olddataitemstoleft + ((PageHeader) origpage)->pd_special - maxTupleEnd == olddataitemstotal); if (newitemoff > maxoff) _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0); diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index ad489e33b3..2c33033441 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -15,6 +15,8 @@ #include "postgres.h" #include "access/bufmask.h" +#include "access/heapam_xlog.h" +#include "access/htup_details.h" #include "access/nbtree.h" #include "access/nbtxlog.h" #include "access/transam.h" diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index ad855894be..818c8c27ee 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -29,7 +29,7 @@ out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec) appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %llu", xlrec->locator.spcOid, xlrec->locator.dbOid, xlrec->locator.relNumber, xlrec->block, - (unsigned long long) U64FromFullTransactionId(xlrec->latestRemovedFullXid)); + 
(unsigned long long) XidFromFullTransactionId(xlrec->latestRemovedFullXid)); } static void @@ -51,7 +51,7 @@ static void out_gistxlogPageDelete(StringInfo buf, gistxlogPageDelete *xlrec) { appendStringInfo(buf, "deleteXid %llu; downlink %u", - (unsigned long long) U64FromFullTransactionId(xlrec->deleteXid), + (unsigned long long) XidFromFullTransactionId(xlrec->deleteXid), xlrec->downlinkOffset); } diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 503808cf0a..407987d373 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -182,6 +182,23 @@ heap2_desc(StringInfo buf, XLogReaderState *record) } } +void +heap3_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + info &= XLOG_HEAP_OPMASK; + if (info == XLOG_HEAP3_BASE_SHIFT) + { + xl_heap_base_shift *xlrec = (xl_heap_base_shift *) rec; + + appendStringInfo(buf, "%s delta %lld ", + xlrec->multi ? 
"MultiXactId" : "XactId", + (long long) xlrec->delta); + } +} + const char * heap_identify(uint8 info) { @@ -265,3 +282,18 @@ heap2_identify(uint8 info) return id; } + +const char * +heap3_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_HEAP3_BASE_SHIFT: + id = "BASE_SHIFT"; + break; + } + + return id; +} diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 950f9269f7..b9c1826770 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -65,9 +65,9 @@ multixact_desc(StringInfo buf, XLogReaderState *record) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; - appendStringInfo(buf, "%llu offset %u nmembers %d: ", + appendStringInfo(buf, "%llu offset %llu nmembers %d: ", (unsigned long long) xlrec->mid, - xlrec->moff, xlrec->nmembers); + (unsigned long long) xlrec->moff, xlrec->nmembers); for (i = 0; i < xlrec->nmembers; i++) out_member(buf, &xlrec->members[i]); } @@ -75,10 +75,11 @@ multixact_desc(StringInfo buf, XLogReaderState *record) { xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; - appendStringInfo(buf, "offsets [%llu, %llu), members [%u, %u)", + appendStringInfo(buf, "offsets [%llu, %llu), members [%llu, %llu)", (unsigned long long) xlrec->startTruncOff, (unsigned long long) xlrec->endTruncOff, - xlrec->startTruncMemb, xlrec->endTruncMemb); + (unsigned long long) xlrec->startTruncMemb, + (unsigned long long) xlrec->endTruncMemb); } } diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index bf25c941e4..c739b65942 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -83,7 +83,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "left %u; right %u; level %u; safexid %llu; ", xlrec->leftsib, xlrec->rightsib, xlrec->level, - (unsigned long long) 
U64FromFullTransactionId(xlrec->safexid)); + (unsigned long long) XidFromFullTransactionId(xlrec->safexid)); appendStringInfo(buf, "leafleft %u; leafright %u; leaftopparent %u", xlrec->leafleftsib, xlrec->leafrightsib, xlrec->leaftopparent); @@ -103,7 +103,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %llu", xlrec->locator.spcOid, xlrec->locator.dbOid, xlrec->locator.relNumber, - (unsigned long long) U64FromFullTransactionId(xlrec->latestRemovedFullXid)); + (unsigned long long) XidFromFullTransactionId(xlrec->latestRemovedFullXid)); break; } case XLOG_BTREE_META_CLEANUP: diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index b930943bb0..f97996d8e7 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -110,7 +110,8 @@ ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *pars { xl_xact_twophase *xl_twophase = (xl_xact_twophase *) data; - parsed->twophase_xid = xl_twophase->xid; + parsed->twophase_xid = + ((uint64) xl_twophase->xid_hi << 32) | xl_twophase->xid_lo; data += sizeof(xl_xact_twophase); @@ -205,7 +206,8 @@ ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed) { xl_xact_twophase *xl_twophase = (xl_xact_twophase *) data; - parsed->twophase_xid = xl_twophase->xid; + parsed->twophase_xid = + ((uint64) xl_twophase->xid_hi << 32) | xl_twophase->xid_lo; data += sizeof(xl_xact_twophase); diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 647d64dc9a..4c70c125b2 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -45,7 +45,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%X; " - "tli %u; prev tli %u; fpw %s; xid %llu; oid %u; multi %llu; offset %u; " + "tli %u; prev tli %u; fpw %s; xid 
%llu; oid %u; multi %llu; offset %llu; " "oldest xid %llu in DB %u; oldest multi %llu in DB %u; " "oldest/newest commit timestamp xid: %llu/%llu; " "oldest running xid %llu; %s", @@ -53,10 +53,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->ThisTimeLineID, checkpoint->PrevTimeLineID, checkpoint->fullPageWrites ? "true" : "false", - (unsigned long long) U64FromFullTransactionId(checkpoint->nextXid), + (unsigned long long) XidFromFullTransactionId(checkpoint->nextXid), checkpoint->nextOid, (unsigned long long) checkpoint->nextMulti, - checkpoint->nextMultiOffset, + (unsigned long long) checkpoint->nextMultiOffset, (unsigned long long) checkpoint->oldestXid, checkpoint->oldestXidDB, (unsigned long long) checkpoint->oldestMulti, diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index afbb5d6b11..9ae0ccbd56 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -294,7 +294,7 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, * sub-XIDs and all of the XIDs for which we're adjusting clog should be * on the same page. Check those conditions, too. 
 */ - if (all_xact_same_page && xid == MyProc->xid && + if (all_xact_same_page && xid == pg_atomic_read_u64(&MyProc->xid) && nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && nsubxids == MyProc->subxidStatus.count && (nsubxids == 0 || @@ -713,7 +713,7 @@ void BootStrapCLOG(void) { int slotno; - int pageno; + int64 pageno; LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); @@ -727,7 +727,10 @@ BootStrapCLOG(void) pageno = TransactionIdToPage(XidFromFullTransactionId(ShmemVariableCache->nextXid)); if (pageno != 0) { + /* Create and zero the first page of the commit log */ slotno = ZeroCLOGPage(pageno, false); + + /* Make sure it's written out */ SimpleLruWritePage(XactCtl, slotno); Assert(!XactCtl->shared->page_dirty[slotno]); } @@ -921,24 +924,11 @@ TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) SimpleLruTruncate(XactCtl, cutoffPage); } - /* * Decide whether a CLOG page number is "older" for truncation purposes. * - * We need to use comparison of TransactionIds here in order to do the right - * thing with wraparound XID arithmetic. However, TransactionIdPrecedes() - * would get weird about permanent xact IDs. So, offset both such that xid1, - * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset - * is relevant to page 0 and to the page preceding page 0. - * - * The page containing oldestXact-2^31 is the important edge case. The - * portion of that page equaling or following oldestXact-2^31 is expendable, - * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is - * the first XID of a page and segment, the entire page and segment is - * expendable, and we could truncate the segment. Recognizing that case would - * require making oldestXact, not just the page containing oldestXact, - * available to this callback. The benefit would be rare and small, so we - * don't optimize that edge case. + * With 64-bit XIDs this function is just "<", but we keep it as a function + * so that its call sites remain unchanged ("vanilla"-like). 
*/ static bool CLOGPagePrecedes(int64 page1, int64 page2) diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index b91097ce0e..7ca59025a7 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -916,25 +916,6 @@ AdvanceOldestCommitTsXid(TransactionId oldestXact) /* * Decide whether a commitTS page number is "older" for truncation purposes. * Analogous to CLOGPagePrecedes(). - * - * At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This - * introduces differences compared to CLOG and the other SLRUs having (1 << - * 31) % per_page == 0. This function never tests exactly - * TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit, - * there are two possible counts of page boundaries between oldestXact and the - * latest XID assigned, depending on whether oldestXact is within the first - * 128 entries of its page. Since this function doesn't know the location of - * oldestXact within page2, it returns false for one page that actually is - * expendable. This is a wider (yet still negligible) version of the - * truncation opportunity that CLOGPagePrecedes() cannot recognize. - * - * For the sake of a worked example, number entries with decimal values such - * that page1==1 entries range from 1.0 to 1.999. Let N+0.15 be the number of - * pages that 2^31 entries will span (N is an integer). If oldestXact=N+2.1, - * then the final safe XID assignment leaves newestXact=1.95. We keep page 2, - * because entry=2.85 is the border that toggles whether entries precede the - * last entry of the oldestXact page. While page 2 is expendable at - * oldestXact=N+2.1, it would be precious at oldestXact=N+2.9. 
*/ static bool CommitTsPagePrecedes(int64 page1, int64 page2) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 35b90229a2..e9a6964ef2 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -112,15 +112,15 @@ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) #define MultiXactIdToOffsetEntry(xid) \ ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) -#define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT) +#define MultiXactIdToOffsetSegment(xid) ((uint64)(MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT)) /* * The situation for members is a bit more complex: we store one byte of * additional flag bits for each TransactionId. To do this without getting - * into alignment issues, we store four bytes of flags, and then the - * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and - * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups - * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * into alignment issues, we store eight bytes of flags, and then the + * corresponding 8 Xids. Each such 9-word (72-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 113 groups + * per page. This wastes 56 bytes per page, but that's OK -- simplicity (and * performance) trumps space efficiency here. * * Note that the "offset" macros work with byte offset, not array indexes, so @@ -132,7 +132,7 @@ #define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) /* how many full bytes of flags are there in a group? 
*/ -#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_FLAGBYTES_PER_GROUP 8 #define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) /* size in bytes of a complete group */ @@ -142,22 +142,9 @@ #define MULTIXACT_MEMBERS_PER_PAGE \ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) -/* - * Because the number of items per page is not a divisor of the last item - * number (member 0xFFFFFFFF), the last segment does not use the maximum number - * of pages, and moreover the last used page therein does not use the same - * number of items as previous pages. (Another way to say it is that the - * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page - * has some empty space after that item.) - * - * This constant is the number of members in the last page of the last segment. - */ -#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ - ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) - /* page in which a member is to be found */ #define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) -#define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT) +#define MXOffsetToMemberSegment(xid) ((uint64)(MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT)) /* Location (byte offset within page) of flag word for a given member */ #define MXOffsetToFlagsOffset(xid) \ @@ -216,22 +203,8 @@ typedef struct MultiXactStateData MultiXactId oldestMultiXactId; Oid oldestMultiXactDB; - /* - * Oldest multixact offset that is potentially referenced by a multixact - * referenced by a relation. We don't always know this value, so there's - * a flag here to indicate whether or not we currently do. 
- */ - MultiXactOffset oldestOffset; - bool oldestOffsetKnown; - /* support for anti-wraparound measures */ MultiXactId multiVacLimit; - MultiXactId multiWarnLimit; - MultiXactId multiStopLimit; - MultiXactId multiWrapLimit; - - /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ /* * Per-backend data starts here. We have two arrays stored in the area @@ -361,9 +334,6 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, - MultiXactOffset start, uint32 distance); -static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, @@ -397,6 +367,9 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, /* MultiXactIdSetOldestMember() must have been called already. */ Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + /* memset members array because with 64-bit xids it has a padding hole */ + MemSet(members, 0, sizeof(members)); + /* * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs * are still running. In typical usage, xid2 will be our own XID and the @@ -512,7 +485,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) * end of the loop. 
*/ newMembers = (MultiXactMember *) - palloc(sizeof(MultiXactMember) * (nmembers + 1)); + palloc0(sizeof(MultiXactMember) * (nmembers + 1)); for (i = 0, j = 0; i < nmembers; i++) { @@ -527,7 +500,6 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) newMembers[j].xid = xid; newMembers[j++].status = status; - newMulti = MultiXactIdCreateFromMembers(j, newMembers); pfree(members); @@ -903,8 +875,8 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, for (i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; - uint32 *flagsptr; - uint32 flagsval; + uint64 *flagsptr; + uint64 flagsval; int bshift; int flagsoff; int memberoff; @@ -927,12 +899,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, *memberptr = members[i].xid; - flagsptr = (uint32 *) + flagsptr = (uint64 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); flagsval = *flagsptr; - flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); - flagsval |= (members[i].status << bshift); + flagsval &= ~((uint64) ((1ULL << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= ((uint64) members[i].status << bshift); *flagsptr = flagsval; MultiXactMemberCtl->shared->page_dirty[slotno] = true; @@ -985,8 +957,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * If we're past multiVacLimit or the safe threshold for member storage * space, or we don't know what the safe threshold for member storage is, * start trying to force autovacuum cycles. - * If we're past multiWarnLimit, start issuing warnings. - * If we're past multiStopLimit, refuse to create new MultiXactIds. * * Note these are pretty much the same protections in GetNewTransactionId. *---------- @@ -1000,41 +970,9 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * possibility of deadlock while doing get_database_name(). First, * copy all the shared values we'll need in this path. 
*/ - MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit; - MultiXactId multiStopLimit = MultiXactState->multiStopLimit; - MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit; - Oid oldest_datoid = MultiXactState->oldestMultiXactDB; LWLockRelease(MultiXactGenLock); - if (IsUnderPostmaster && - !MultiXactIdPrecedes(result, multiStopLimit)) - { - char *oldest_datname = get_database_name(oldest_datoid); - - /* - * Immediately kick autovacuum into action as we're already in - * ERROR territory. - */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - /* complain even if that DB has disappeared */ - if (oldest_datname) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"", - oldest_datname), - errhint("Execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - else - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u", - oldest_datoid), - errhint("Execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - } - /* * To avoid swamping the postmaster with signals, we issue the autovac * request only once per 64K multis generated. 
This still gives @@ -1043,31 +981,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) if (IsUnderPostmaster && (result % 65536) == 0) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - if (!MultiXactIdPrecedes(result, multiWarnLimit)) - { - char *oldest_datname = get_database_name(oldest_datoid); - - /* complain even if that DB has disappeared */ - if (oldest_datname) - ereport(WARNING, - (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", - "database \"%s\" must be vacuumed before %u more MultiXactIds are used", - multiWrapLimit - result, - oldest_datname, - multiWrapLimit - result), - errhint("Execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - else - ereport(WARNING, - (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", - "database with OID %u must be vacuumed before %u more MultiXactIds are used", - multiWrapLimit - result, - oldest_datoid, - multiWrapLimit - result), - errhint("Execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - } - /* Re-acquire lock and start over */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); result = MultiXactState->nextMXact; @@ -1092,78 +1005,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) else *offset = nextOffset; - /*---------- - * Protect against overrun of the members space as well, with the - * following rules: - * - * If we're past offsetStopLimit, refuse to generate more multis. - * If we're close to offsetStopLimit, emit a warning. - * - * Arbitrarily, we start emitting warnings when we're 20 segments or less - * from offsetStopLimit. - * - * Note we haven't updated the shared state yet, so if we fail at this - * point, the multixact ID we grabbed can still be used by the next guy. 
- * - * Note that there is no point in forcing autovacuum runs here: the - * multixact freeze settings would have to be reduced for that to have any - * effect. - *---------- - */ -#define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, - nmembers)) - { - /* see comment in the corresponding offsets wraparound case */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("multixact \"members\" limit exceeded"), - errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", - "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", - MultiXactState->offsetStopLimit - nextOffset - 1, - nmembers, - MultiXactState->offsetStopLimit - nextOffset - 1), - errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.", - MultiXactState->oldestMultiXactDB))); - } - - /* - * Check whether we should kick autovacuum into action, to prevent members - * wraparound. NB we use a much larger window to trigger autovacuum than - * just the warning limit. The warning is just a measure of last resort - - * this is in line with GetNewTransactionId's behaviour. - */ - if (!MultiXactState->oldestOffsetKnown || - (MultiXactState->nextOffset - MultiXactState->oldestOffset - > MULTIXACT_MEMBER_SAFE_THRESHOLD)) - { - /* - * To avoid swamping the postmaster with signals, we issue the autovac - * request only when crossing a segment boundary. With default - * compilation settings that's roughly after 50k members. This still - * gives plenty of chances before we get into real trouble. 
- */ - if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != - (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - } - - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, - nextOffset, - nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) - ereport(WARNING, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", - "database with OID %u must be vacuumed before %d more multixact members are used", - MultiXactState->offsetStopLimit - nextOffset + nmembers, - MultiXactState->oldestMultiXactDB, - MultiXactState->offsetStopLimit - nextOffset + nmembers), - errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings."))); - ExtendMultiXactMember(nextOffset, nmembers); /* @@ -1192,8 +1033,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "GetNew: returning %llu offset %u", - (unsigned long long) result, *offset); + debug_elog4(DEBUG2, "GetNew: returning %llu offset %llu", + (unsigned long long) result, (unsigned long long) *offset); return result; } @@ -1303,14 +1144,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, if (MultiXactIdPrecedes(multi, oldestMXact)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("MultiXactId %u does no longer exist -- apparent wraparound", - multi))); + errmsg("MultiXactId %llu does no longer exist -- apparent wraparound", + (unsigned long long) multi))); if (!MultiXactIdPrecedes(multi, nextMXact)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("MultiXactId %u has not been created yet -- apparent wraparound", - multi))); + errmsg("MultiXactId %llu has not been created yet -- apparent 
wraparound", + (unsigned long long) multi))); /* * Find out the offset at which we need to start reading MultiXactMembers @@ -1356,7 +1197,10 @@ retry: offptr += entryno; offset = *offptr; - Assert(offset != 0); + if (offset == 0) + ereport(ERROR, + (errmsg("found invalid zero offset in multixact %llu", + (unsigned long long) multi))); /* * Use the same increment rule as GetNewMultiXactId(), that is, don't @@ -1403,7 +1247,7 @@ retry: LWLockRelease(MultiXactOffsetSLRULock); - ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); + ptr = (MultiXactMember *) palloc0(length * sizeof(MultiXactMember)); /* Now get the members themselves. */ LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); @@ -1413,7 +1257,7 @@ retry: for (i = 0; i < length; i++, offset++) { TransactionId *xactptr; - uint32 *flagsptr; + uint64 *flagsptr; int flagsoff; int bshift; int memberoff; @@ -1439,7 +1283,7 @@ retry: flagsoff = MXOffsetToFlagsOffset(offset); bshift = MXOffsetToFlagsBitShift(offset); - flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + flagsptr = (uint64 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); ptr[truelength].xid = *xactptr; ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; @@ -1903,7 +1747,7 @@ void BootStrapMultiXact(void) { int slotno; - int pageno; + int64 pageno; LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); @@ -2224,8 +2068,9 @@ void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset) { - debug_elog4(DEBUG2, "MultiXact: setting next multi to %llu offset %u", - (unsigned long long) nextMulti, nextMultiOffset); + debug_elog4(DEBUG2, "MultiXact: setting next multi to %llu offset %llu", + (unsigned long long) nextMulti, + (unsigned long long) nextMultiOffset); LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->nextMXact = nextMulti; MultiXactState->nextOffset = nextMultiOffset; @@ -2259,47 +2104,9 @@ SetMultiXactIdLimit(MultiXactId 
oldest_datminmxid, Oid oldest_datoid, bool is_startup) { MultiXactId multiVacLimit; - MultiXactId multiWarnLimit; - MultiXactId multiStopLimit; - MultiXactId multiWrapLimit; - MultiXactId curMulti; - bool needs_offset_vacuum; Assert(MultiXactIdIsValid(oldest_datminmxid)); - /* - * We pretend that a wrap will happen halfway through the multixact ID - * space, but that's not really true, because multixacts wrap differently - * from transaction IDs. Note that, separately from any concern about - * multixact IDs wrapping, we must ensure that multixact members do not - * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. - */ - multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); - if (multiWrapLimit < FirstMultiXactId) - multiWrapLimit += FirstMultiXactId; - - /* - * We'll refuse to continue assigning MultiXactIds once we get within 3M - * multi of data loss. See SetTransactionIdLimit. - */ - multiStopLimit = multiWrapLimit - 3000000; - if (multiStopLimit < FirstMultiXactId) - multiStopLimit -= FirstMultiXactId; - - /* - * We'll start complaining loudly when we get within 40M multis of data - * loss. This is kind of arbitrary, but if you let your gas gauge get - * down to 2% of full, would you be looking for the next gas station? We - * need to be fairly liberal about this number because there are lots of - * scenarios where most transactions are done by automatic clients that - * won't pay attention to warnings. (No, we're not gonna make this - * configurable. If you know enough to configure it, you know enough to - * not get in this kind of trouble in the first place.) - */ - multiWarnLimit = multiWrapLimit - 40000000; - if (multiWarnLimit < FirstMultiXactId) - multiWarnLimit -= FirstMultiXactId; - /* * We'll start trying to force autovacuums when oldest_datminmxid gets to * be more than autovacuum_multixact_freeze_max_age mxids old. @@ -2309,25 +2116,14 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, * its value. 
See SetTransactionIdLimit. */ multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age; - if (multiVacLimit < FirstMultiXactId) - multiVacLimit += FirstMultiXactId; /* Grab lock for just long enough to set the new limit values */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->oldestMultiXactId = oldest_datminmxid; MultiXactState->oldestMultiXactDB = oldest_datoid; MultiXactState->multiVacLimit = multiVacLimit; - MultiXactState->multiWarnLimit = multiWarnLimit; - MultiXactState->multiStopLimit = multiStopLimit; - MultiXactState->multiWrapLimit = multiWrapLimit; - curMulti = MultiXactState->nextMXact; LWLockRelease(MultiXactGenLock); - /* Log the info */ - ereport(DEBUG1, - (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u", - multiWrapLimit, oldest_datoid))); - /* * Computing the actual limits is only possible once the data directory is * in a consistent state. There's no need to compute the limits while @@ -2339,59 +2135,6 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, return; Assert(!InRecovery); - - /* Set limits for offset vacuum. */ - needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); - - /* - * If past the autovacuum force point, immediately signal an autovac - * request. The reason for this is that autovac only processes one - * database per invocation. Once it's finished cleaning up the oldest - * database, it'll call here, and we'll signal the postmaster to start - * another iteration immediately if there are still any old databases. - */ - if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || - needs_offset_vacuum) && IsUnderPostmaster) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - /* Give an immediate warning if past the wrap warn point */ - if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) - { - char *oldest_datname; - - /* - * We can be called when not inside a transaction, for example during - * StartupXLOG(). 
In such a case we cannot do database access, so we - * must just report the oldest DB's OID. - * - * Note: it's also possible that get_database_name fails and returns - * NULL, for example because the database just got dropped. We'll - * still warn, even though the warning might now be unnecessary. - */ - if (IsTransactionState()) - oldest_datname = get_database_name(oldest_datoid); - else - oldest_datname = NULL; - - if (oldest_datname) - ereport(WARNING, - (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", - "database \"%s\" must be vacuumed before %u more MultiXactIds are used", - multiWrapLimit - curMulti, - oldest_datname, - multiWrapLimit - curMulti), - errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - else - ereport(WARNING, - (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", - "database with OID %u must be vacuumed before %u more MultiXactIds are used", - multiWrapLimit - curMulti, - oldest_datoid, - multiWrapLimit - curMulti), - errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - } } /* @@ -2416,8 +2159,8 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, } if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) { - debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", - minMultiOffset); + debug_elog3(DEBUG2, "MultiXact: setting next offset to %llu", + (unsigned long long) minMultiOffset); MultiXactState->nextOffset = minMultiOffset; } LWLockRelease(MultiXactGenLock); @@ -2489,7 +2232,7 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) { int flagsoff; int flagsbit; - uint32 difference; + uint64 difference; /* * Only zero when at first entry of a 
page. @@ -2510,23 +2253,7 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockRelease(MultiXactMemberSLRULock); } - /* - * Compute the number of items till end of current page. Careful: if - * addition of unsigned ints wraps around, we're at the last page of - * the last segment; since that page holds a different number of items - * than other pages, we need to do it differently. - */ - if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) - { - /* - * This is the last page of the last segment; we can compute the - * number of items left to allocate in it without modulo - * arithmetic. - */ - difference = MaxMultiXactOffset - offset + 1; - } - else - difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; /* * Advance to next page, taking care to properly handle the wraparound @@ -2590,184 +2317,6 @@ GetOldestMultiXactId(void) return oldestMXact; } -/* - * Determine how aggressively we need to vacuum in order to prevent member - * wraparound. - * - * To do so determine what's the oldest member offset and install the limit - * info in MultiXactState, where it can be used to prevent overrun of old data - * in the members SLRU area. - * - * The return value is true if emergency autovacuum is required and false - * otherwise. - */ -static bool -SetOffsetVacuumLimit(bool is_startup) -{ - MultiXactId oldestMultiXactId; - MultiXactId nextMXact; - MultiXactOffset oldestOffset = 0; /* placate compiler */ - MultiXactOffset prevOldestOffset; - MultiXactOffset nextOffset; - bool oldestOffsetKnown = false; - bool prevOldestOffsetKnown; - MultiXactOffset offsetStopLimit = 0; - MultiXactOffset prevOffsetStopLimit; - - /* - * NB: Have to prevent concurrent truncation, we might otherwise try to - * lookup an oldestMulti that's concurrently getting truncated away. 
- */ - LWLockAcquire(MultiXactTruncationLock, LW_SHARED); - - /* Read relevant fields from shared memory. */ - LWLockAcquire(MultiXactGenLock, LW_SHARED); - oldestMultiXactId = MultiXactState->oldestMultiXactId; - nextMXact = MultiXactState->nextMXact; - nextOffset = MultiXactState->nextOffset; - prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; - prevOldestOffset = MultiXactState->oldestOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; - Assert(MultiXactState->finishedStartup); - LWLockRelease(MultiXactGenLock); - - /* - * Determine the offset of the oldest multixact. Normally, we can read - * the offset from the multixact itself, but there's an important special - * case: if there are no multixacts in existence at all, oldestMXact - * obviously can't point to one. It will instead point to the multixact - * ID that will be assigned the next time one is needed. - */ - if (oldestMultiXactId == nextMXact) - { - /* - * When the next multixact gets created, it will be stored at the next - * offset. - */ - oldestOffset = nextOffset; - oldestOffsetKnown = true; - } - else - { - /* - * Figure out where the oldest existing multixact's offsets are - * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, - * the supposedly-earliest multixact might not really exist. We are - * careful not to fail in that case. - */ - oldestOffsetKnown = - find_multixact_start(oldestMultiXactId, &oldestOffset); - - if (oldestOffsetKnown) - ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId member is at offset %llu", - (unsigned long long) oldestOffset))); - else - ereport(LOG, - (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %llu does not exist on disk", - (unsigned long long) oldestMultiXactId))); - } - - LWLockRelease(MultiXactTruncationLock); - - /* - * If we can, compute limits (and install them MultiXactState) to prevent - * overrun of old data in the members SLRU area. 
We can only do so if the - * oldest offset is known though. - */ - if (oldestOffsetKnown) - { - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - if (!prevOldestOffsetKnown && !is_startup) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - - ereport(DEBUG1, - (errmsg_internal("MultiXact member stop limit is now %llu based on MultiXact %llu", - (unsigned long long) offsetStopLimit, - (unsigned long long) oldestMultiXactId))); - } - else if (prevOldestOffsetKnown) - { - /* - * If we failed to get the oldest offset this time, but we have a - * value from a previous pass through this function, use the old - * values rather than automatically forcing an emergency autovacuum - * cycle again. - */ - oldestOffset = prevOldestOffset; - oldestOffsetKnown = true; - offsetStopLimit = prevOffsetStopLimit; - } - - /* Install the computed values */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->oldestOffset = oldestOffset; - MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->offsetStopLimit = offsetStopLimit; - LWLockRelease(MultiXactGenLock); - - /* - * Do we need an emergency autovacuum? If we're not sure, assume yes. - */ - return !oldestOffsetKnown || - (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); -} - -/* - * Return whether adding "distance" to "start" would move past "boundary". - * - * We use this to determine whether the addition is "wrapping around" the - * boundary point, hence the name. The reason we don't want to use the regular - * 2^31-modulo arithmetic here is that we want to be able to use the whole of - * the 2^32-1 space here, allowing for more multixacts than would fit - * otherwise. 
- */ -static bool -MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, - uint32 distance) -{ - MultiXactOffset finish; - - /* - * Note that offset number 0 is not used (see GetMultiXactIdMembers), so - * if the addition wraps around the UINT_MAX boundary, skip that value. - */ - finish = start + distance; - if (finish < start) - finish++; - - /*----------------------------------------------------------------------- - * When the boundary is numerically greater than the starting point, any - * value numerically between the two is not wrapped: - * - * <----S----B----> - * [---) = F wrapped past B (and UINT_MAX) - * [---) = F not wrapped - * [----] = F wrapped past B - * - * When the boundary is numerically less than the starting point (i.e. the - * UINT_MAX wraparound occurs somewhere in between) then all values in - * between are wrapped: - * - * <----B----S----> - * [---) = F not wrapped past B (but wrapped past UINT_MAX) - * [---) = F wrapped past B (and UINT_MAX) - * [----] = F not wrapped - *----------------------------------------------------------------------- - */ - if (start < boundary) - return finish >= boundary || finish < start; - else - return finish >= boundary && finish < start; -} - /* * Find the starting offset of the given MultiXactId. * @@ -2811,97 +2360,6 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) return true; } -/* - * Determine how many multixacts, and how many multixact members, currently - * exist. Return false if unable to determine. 
- */ -static bool -ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) -{ - MultiXactOffset nextOffset; - MultiXactOffset oldestOffset; - MultiXactId oldestMultiXactId; - MultiXactId nextMultiXactId; - bool oldestOffsetKnown; - - LWLockAcquire(MultiXactGenLock, LW_SHARED); - nextOffset = MultiXactState->nextOffset; - oldestMultiXactId = MultiXactState->oldestMultiXactId; - nextMultiXactId = MultiXactState->nextMXact; - oldestOffset = MultiXactState->oldestOffset; - oldestOffsetKnown = MultiXactState->oldestOffsetKnown; - LWLockRelease(MultiXactGenLock); - - if (!oldestOffsetKnown) - return false; - - *members = nextOffset - oldestOffset; - *multixacts = nextMultiXactId - oldestMultiXactId; - return true; -} - -/* - * Multixact members can be removed once the multixacts that refer to them - * are older than every datminmxid. autovacuum_multixact_freeze_max_age and - * vacuum_multixact_freeze_table_age work together to make sure we never have - * too many multixacts; we hope that, at least under normal circumstances, - * this will also be sufficient to keep us from using too many offsets. - * However, if the average multixact has many members, we might exhaust the - * members space while still using few enough members that these limits fail - * to trigger full table scans for relminmxid advancement. At that point, - * we'd have no choice but to start failing multixact-creating operations - * with an error. - * - * To prevent that, if more than a threshold portion of the members space is - * used, we effectively reduce autovacuum_multixact_freeze_max_age and - * to a value just less than the number of multixacts in use. We hope that - * this will quickly trigger autovacuuming on the table or tables with the - * oldest relminmxid, thus allowing datminmxid values to advance and removing - * some members. - * - * As the fraction of the member space currently in use grows, we become - * more aggressive in clamping this value. 
That not only causes autovacuum - * to ramp up, but also makes any manual vacuums the user issues more - * aggressive. This happens because vacuum_set_xid_limits() clamps the - * freeze table and the minimum freeze age based on the effective - * autovacuum_multixact_freeze_max_age this function returns. In the worst - * case, we'll claim the freeze_max_age to zero, and every vacuum of any - * table will try to freeze every multixact. - * - * It's possible that these thresholds should be user-tunable, but for now - * we keep it simple. - */ -int -MultiXactMemberFreezeThreshold(void) -{ - MultiXactOffset members; - uint32 multixacts; - uint32 victim_multixacts; - double fraction; - - /* If we can't determine member space utilization, assume the worst. */ - if (!ReadMultiXactCounts(&multixacts, &members)) - return 0; - - /* If member space utilization is low, no special action is required. */ - if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) - return autovacuum_multixact_freeze_max_age; - - /* - * Compute a target for relminmxid advancement. The number of multixacts - * we try to eliminate from the system is based on how far we are past - * MULTIXACT_MEMBER_SAFE_THRESHOLD. - */ - fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / - (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); - victim_multixacts = multixacts * fraction; - - /* fraction could be > 1.0, but lowest possible freeze age is zero */ - if (victim_multixacts > multixacts) - return 0; - return multixacts - victim_multixacts; -} - typedef struct mxtruncinfo { int64 earliestExistingPage; @@ -2928,35 +2386,12 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data /* * Delete members segments [oldest, newOldest) - * - * The members SLRU can, in contrast to the offsets one, be filled to almost - * the full range at once. 
This means SimpleLruTruncate() can't trivially be - * used - instead the to-be-deleted range is computed using the offsets - * SLRU. C.f. TruncateMultiXact(). */ static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) { - const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); - int startsegment = MXOffsetToMemberSegment(oldestOffset); - int endsegment = MXOffsetToMemberSegment(newOldestOffset); - int segment = startsegment; - - /* - * Delete all the segments but the last one. The last segment can still - * contain, possibly partially, valid data. - */ - while (segment != endsegment) - { - elog(DEBUG2, "truncating multixact members segment %x", segment); - SlruDeleteSegment(MultiXactMemberCtl, segment); - - /* move to next segment, handling wraparound correctly */ - if (segment == maxsegment) - segment = 0; - else - segment += 1; - } + SimpleLruTruncate(MultiXactMemberCtl, + MXOffsetToMemberPage(newOldestOffset)); } /* @@ -3075,7 +2510,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) { ereport(LOG, (errmsg("oldest MultiXact %llu not found, earliest MultiXact %llu, skipping truncation", - (unsigned long long) oldestMulti, (unsigned long long) earliest))); + (unsigned long long) oldestMulti, + (unsigned long long) earliest))); LWLockRelease(MultiXactTruncationLock); return; } @@ -3099,14 +2535,14 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) } elog(DEBUG1, "performing multixact truncation: " - "offsets [%llu, %llu), offsets segments [%x, %x), " - "members [%u, %u), members segments [%x, %x)", + "offsets [%llu, %llu), offsets segments [%012llx, %012llx), " + "members [%lld, %lld), members segments [%012llx, %012llx)", (unsigned long long) oldestMulti, (unsigned long long) newOldestMulti, - MultiXactIdToOffsetSegment(oldestMulti), - MultiXactIdToOffsetSegment(newOldestMulti), - oldestOffset, newOldestOffset, - MXOffsetToMemberSegment(oldestOffset), - 
MXOffsetToMemberSegment(newOldestOffset)); + (unsigned long long) MultiXactIdToOffsetSegment(oldestMulti), + (unsigned long long) MultiXactIdToOffsetSegment(newOldestMulti), + (long long) oldestOffset, (long long) newOldestOffset, + (unsigned long long) MXOffsetToMemberSegment(oldestOffset), + (unsigned long long) MXOffsetToMemberSegment(newOldestOffset)); /* * Do truncation, and the WAL logging of the truncation, in a critical @@ -3180,7 +2616,7 @@ MultiXactOffsetPagePrecedes(int64 page1, int64 page2) /* * Decide whether a MultiXactMember page number is "older" for truncation - * purposes. There is no "invalid offset number" so use the numbers verbatim. + * purposes. There is no "invalid offset number" so use the numbers verbatim. */ static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2) @@ -3205,7 +2641,7 @@ MultiXactMemberPagePrecedes(int64 page1, int64 page2) bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) { - int32 diff = (int32) (multi1 - multi2); + int64 diff = (int64) (multi1 - multi2); return (diff < 0); } @@ -3219,7 +2655,7 @@ MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) { - int32 diff = (int32) (multi1 - multi2); + int64 diff = (int64) (multi1 - multi2); return (diff <= 0); } @@ -3231,7 +2667,7 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) { - int32 diff = (int32) (offset1 - offset2); + int64 diff = (int64) (offset1 - offset2); return (diff < 0); } @@ -3355,15 +2791,16 @@ multixact_redo(XLogReaderState *record) SizeOfMultiXactTruncate); elog(DEBUG1, "replaying multixact truncation: " - "offsets [%llu, %llu), offsets segments [%x, %x), " - "members [%u, %u), members segments [%x, %x)", + "offsets [%llu, %llu), offsets segments [%012llx, %012llx), " + "members [%llu, %llu), members segments [%012llx, %012llx)", (unsigned long long) 
xlrec.startTruncOff, (unsigned long long) xlrec.endTruncOff, - MultiXactIdToOffsetSegment(xlrec.startTruncOff), - MultiXactIdToOffsetSegment(xlrec.endTruncOff), - xlrec.startTruncMemb, xlrec.endTruncMemb, - MXOffsetToMemberSegment(xlrec.startTruncMemb), - MXOffsetToMemberSegment(xlrec.endTruncMemb)); + (unsigned long long) MultiXactIdToOffsetSegment(xlrec.startTruncOff), + (unsigned long long) MultiXactIdToOffsetSegment(xlrec.endTruncOff), + (unsigned long long) xlrec.startTruncMemb, + (unsigned long long) xlrec.endTruncMemb, + (unsigned long long) MXOffsetToMemberSegment(xlrec.startTruncMemb), + (unsigned long long) MXOffsetToMemberSegment(xlrec.endTruncMemb)); /* should not be required, but more than cheap enough */ LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); @@ -3407,7 +2844,8 @@ pg_get_multixact_members(PG_FUNCTION_ARGS) if (mxid < FirstMultiXactId) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid MultiXactId: %llu", (unsigned long long) mxid))); + errmsg("invalid MultiXactId: %llu", + (unsigned long long) mxid))); if (SRF_IS_FIRSTCALL()) { diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 9e765c6c28..c186e177ed 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1428,7 +1428,7 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) * must not assign. 
*/ lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */ - rhs = lhs + (1U << 31); + rhs = lhs + (1ULL << 63); Assert(TransactionIdPrecedes(lhs, rhs)); Assert(TransactionIdPrecedes(rhs, lhs)); Assert(!TransactionIdPrecedes(lhs - 1, rhs)); @@ -1444,13 +1444,14 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page)); Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page)); Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page) - || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */ + || (1ULL << 63) % per_page != 0); /* See CommitTsPagePrecedes() */ Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page) - || (1U << 31) % per_page != 0); + || (1ULL << 63) % per_page != 0); Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page)); Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page)); Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page)); + /* * GetNewTransactionId() has assigned the last XID it can safely use, and * that XID is in the *LAST* page of the second segment. 
We must not @@ -1460,7 +1461,7 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) newestXact = newestPage * per_page + offset; Assert(newestXact / per_page == newestPage); oldestXact = newestXact + 1; - oldestXact -= 1U << 31; + oldestXact -= 1ULL << 63; oldestPage = oldestXact / per_page; Assert(!SlruMayDeleteSegment(ctl, (newestPage - @@ -1476,7 +1477,7 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) newestXact = newestPage * per_page + offset; Assert(newestXact / per_page == newestPage); oldestXact = newestXact + 1; - oldestXact -= 1U << 31; + oldestXact -= 1ULL << 63; oldestPage = oldestXact / per_page; Assert(!SlruMayDeleteSegment(ctl, (newestPage - @@ -1582,7 +1583,7 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) if ((len == 12 || len == 13 || len == 14) && strspn(clde->d_name, "0123456789ABCDEF") == len) { - segno = (int) strtol(clde->d_name, NULL, 16); + segno = (int) strtoi64(clde->d_name, NULL, 16); segpage = segno * SLRU_PAGES_PER_SEGMENT; elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s", diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index e2e20ed06c..2d124d9600 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -212,11 +212,14 @@ void BootStrapSUBTRANS(void) { int slotno; + int64 pageno; + + pageno = TransactionIdToPage(XidFromFullTransactionId(ShmemVariableCache->nextXid)); LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); /* Create and zero the first page of the subtrans log */ - slotno = ZeroSUBTRANSPage(0); + slotno = ZeroSUBTRANSPage(pageno); /* Make sure it's written out */ SimpleLruWritePage(SubTransCtl, slotno); @@ -269,9 +272,6 @@ StartupSUBTRANS(TransactionId oldestActiveXID) { (void) ZeroSUBTRANSPage(startPage); startPage++; - /* must account for wraparound */ - if (startPage > TransactionIdToPage(MaxTransactionId)) - startPage = 0; } (void) ZeroSUBTRANSPage(startPage); @@ 
-348,6 +348,7 @@ TruncateSUBTRANS(TransactionId oldestXact) * a page and oldestXact == next XID. In that case, if we didn't subtract * one, we'd trigger SimpleLruTruncate's wraparound detection. */ + TransactionIdRetreat(oldestXact); cutoffPage = TransactionIdToPage(oldestXact); diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 27410c4697..a2a41e58f2 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -274,14 +274,14 @@ TransactionIdPrecedes(TransactionId id1, TransactionId id2) { /* * If either ID is a permanent XID then we can just do unsigned - * comparison. If both are normal, do a modulo-2^32 comparison. + * comparison. If both are normal, do a modulo-2^64 comparison. */ - int32 diff; + int64 diff; if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) return (id1 < id2); - diff = (int32) (id1 - id2); + diff = (int64) (id1 - id2); return (diff < 0); } @@ -291,12 +291,12 @@ TransactionIdPrecedes(TransactionId id1, TransactionId id2) bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2) { - int32 diff; + int64 diff; if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) return (id1 <= id2); - diff = (int32) (id1 - id2); + diff = (int64) (id1 - id2); return (diff <= 0); } @@ -306,12 +306,12 @@ TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2) bool TransactionIdFollows(TransactionId id1, TransactionId id2) { - int32 diff; + int64 diff; if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) return (id1 > id2); - diff = (int32) (id1 - id2); + diff = (int64) (id1 - id2); return (diff > 0); } @@ -321,12 +321,12 @@ TransactionIdFollows(TransactionId id1, TransactionId id2) bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2) { - int32 diff; + int64 diff; if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) return (id1 >= id2); - diff = (int32) (id1 - id2); + diff = (int64) (id1 - id2); 
return (diff >= 0); } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 4b81dfee16..0732393539 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -476,8 +476,8 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->lxid = xid; proc->backendId = InvalidBackendId; } - proc->xid = xid; - Assert(proc->xmin == InvalidTransactionId); + pg_atomic_write_u64(&proc->xid, xid); + Assert(pg_atomic_read_u64(&proc->xmin) == InvalidTransactionId); proc->delayChkptFlags = 0; proc->statusFlags = 0; proc->pid = 0; @@ -792,7 +792,7 @@ pg_prepared_xact(PG_FUNCTION_ARGS) * Form tuple with appropriate data. */ - values[0] = TransactionIdGetDatum(proc->xid); + values[0] = TransactionIdGetDatum(pg_atomic_read_u64(&proc->xid)); values[1] = CStringGetTextDatum(gxact->gid); values[2] = TimestampTzGetDatum(gxact->prepared_at); values[3] = ObjectIdGetDatum(gxact->owner); @@ -943,7 +943,7 @@ TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) /************************************************************************/ #define TwoPhaseFilePath(path, xid) \ - snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid) + snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%016llX", (unsigned long long) xid) /* * 2PC state file format: @@ -1882,13 +1882,13 @@ restoreTwoPhaseData(void) cldir = AllocateDir(TWOPHASE_DIR); while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) { - if (strlen(clde->d_name) == 8 && - strspn(clde->d_name, "0123456789ABCDEF") == 8) + if (strlen(clde->d_name) == 16 && + strspn(clde->d_name, "0123456789ABCDEF") == 16) { TransactionId xid; char *buf; - xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + xid = (TransactionId) strtou64(clde->d_name, NULL, 16); buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, true, false, false); @@ -2220,7 +2220,6 @@ ProcessTwoPhaseBuffer(TransactionId xid, if (fromdisk) { - /* Read and validate file */ buf = 
ReadTwoPhaseFile(xid, false); } else diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 849a7ce9d6..53c79d9a31 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -66,9 +66,9 @@ GetNewTransactionId(bool isSubXact) if (IsBootstrapProcessingMode()) { Assert(!isSubXact); - MyProc->xid = BootstrapTransactionId; - ProcGlobal->xids[MyProc->pgxactoff] = BootstrapTransactionId; - return FullTransactionIdFromEpochAndXid(0, BootstrapTransactionId); + pg_atomic_write_u64(&MyProc->xid, BootstrapTransactionId); + pg_atomic_write_u64(&ProcGlobal->xids[MyProc->pgxactoff], BootstrapTransactionId); + return FullTransactionIdFromXid(BootstrapTransactionId); } /* safety check, we should never get this far in a HS standby */ @@ -102,11 +102,6 @@ GetNewTransactionId(bool isSubXact) * possibility of deadlock while doing get_database_name(). First, * copy all the shared values we'll need in this path. */ - TransactionId xidWarnLimit = ShmemVariableCache->xidWarnLimit; - TransactionId xidStopLimit = ShmemVariableCache->xidStopLimit; - TransactionId xidWrapLimit = ShmemVariableCache->xidWrapLimit; - Oid oldest_datoid = ShmemVariableCache->oldestXidDB; - LWLockRelease(XidGenLock); /* @@ -117,48 +112,6 @@ GetNewTransactionId(bool isSubXact) if (IsUnderPostmaster && (xid % 65536) == 0) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - if (IsUnderPostmaster && - TransactionIdFollowsOrEquals(xid, xidStopLimit)) - { - char *oldest_datname = get_database_name(oldest_datoid); - - /* complain even if that DB has disappeared */ - if (oldest_datname) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("database is not accepting commands to avoid wraparound data loss in database \"%s\"", - oldest_datname), - errhint("Stop the postmaster and vacuum that database in single-user mode.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); 
- else - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("database is not accepting commands to avoid wraparound data loss in database with OID %u", - oldest_datoid), - errhint("Stop the postmaster and vacuum that database in single-user mode.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - } - else if (TransactionIdFollowsOrEquals(xid, xidWarnLimit)) - { - char *oldest_datname = get_database_name(oldest_datoid); - - /* complain even if that DB has disappeared */ - if (oldest_datname) - ereport(WARNING, - (errmsg("database \"%s\" must be vacuumed within %u transactions", - oldest_datname, - xidWrapLimit - xid), - errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - else - ereport(WARNING, - (errmsg("database with OID %u must be vacuumed within %u transactions", - oldest_datoid, - xidWrapLimit - xid), - errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - } - /* Re-acquire lock and start over */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); full_xid = ShmemVariableCache->nextXid; @@ -228,8 +181,8 @@ GetNewTransactionId(bool isSubXact) Assert(!MyProc->subxidStatus.overflowed); /* LWLockRelease acts as barrier */ - MyProc->xid = xid; - ProcGlobal->xids[MyProc->pgxactoff] = xid; + pg_atomic_write_u64(&MyProc->xid, xid); + pg_atomic_write_u64(&ProcGlobal->xids[MyProc->pgxactoff], xid); } else { @@ -270,7 +223,7 @@ ReadNextFullTransactionId(void) } /* - * Advance nextXid to the value after a given xid. The epoch is inferred. + * Advance nextXid to the value after a given xid. * This must only be called during recovery or from two-phase start-up code. 
*/ void @@ -278,7 +231,6 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) { FullTransactionId newNextFullXid; TransactionId next_xid; - uint32 epoch; /* * It is safe to read nextXid without a lock, because this is only called @@ -292,19 +244,9 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) if (!TransactionIdFollowsOrEquals(xid, next_xid)) return; - /* - * Compute the FullTransactionId that comes after the given xid. To do - * this, we preserve the existing epoch, but detect when we've wrapped - * into a new epoch. This is necessary because WAL records and 2PC state - * currently contain 32 bit xids. The wrap logic is safe in those cases - * because the span of active xids cannot exceed one epoch at any given - * point in the WAL stream. - */ + /* Compute the FullTransactionId that comes after the given xid. */ TransactionIdAdvance(xid); - epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); - if (unlikely(xid < next_xid)) - ++epoch; - newNextFullXid = FullTransactionIdFromEpochAndXid(epoch, xid); + newNextFullXid = FullTransactionIdFromXid(xid); /* * We still need to take a lock to modify the value when there are @@ -345,54 +287,10 @@ void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) { TransactionId xidVacLimit; - TransactionId xidWarnLimit; - TransactionId xidStopLimit; - TransactionId xidWrapLimit; TransactionId curXid; Assert(TransactionIdIsNormal(oldest_datfrozenxid)); - /* - * The place where we actually get into deep trouble is halfway around - * from the oldest potentially-existing XID. (This calculation is - * probably off by one or two counts, because the special XIDs reduce the - * size of the loop a little bit. But we throw in plenty of slop below, - * so it doesn't matter.) 
- */ - xidWrapLimit = oldest_datfrozenxid + (MaxTransactionId >> 1); - if (xidWrapLimit < FirstNormalTransactionId) - xidWrapLimit += FirstNormalTransactionId; - - /* - * We'll refuse to continue assigning XIDs in interactive mode once we get - * within 3M transactions of data loss. This leaves lots of room for the - * DBA to fool around fixing things in a standalone backend, while not - * being significant compared to total XID space. (VACUUM requires an XID - * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA - * might do by reflex, assigns an XID. Hence, we had better be sure - * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two - * completely-idle segments. In the event of edge-case bugs involving - * page or segment arithmetic, idle segments render the bugs unreachable - * outside of single-user mode. - */ - xidStopLimit = xidWrapLimit - 3000000; - if (xidStopLimit < FirstNormalTransactionId) - xidStopLimit -= FirstNormalTransactionId; - - /* - * We'll start complaining loudly when we get within 40M transactions of - * data loss. This is kind of arbitrary, but if you let your gas gauge - * get down to 2% of full, would you be looking for the next gas station? - * We need to be fairly liberal about this number because there are lots - * of scenarios where most transactions are done by automatic clients that - * won't pay attention to warnings. (No, we're not gonna make this - * configurable. If you know enough to configure it, you know enough to - * not get in this kind of trouble in the first place.) - */ - xidWarnLimit = xidWrapLimit - 40000000; - if (xidWarnLimit < FirstNormalTransactionId) - xidWarnLimit -= FirstNormalTransactionId; - /* * We'll start trying to force autovacuums when oldest_datfrozenxid gets * to be more than autovacuum_freeze_max_age transactions old. 
@@ -416,18 +314,10 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) LWLockAcquire(XidGenLock, LW_EXCLUSIVE); ShmemVariableCache->oldestXid = oldest_datfrozenxid; ShmemVariableCache->xidVacLimit = xidVacLimit; - ShmemVariableCache->xidWarnLimit = xidWarnLimit; - ShmemVariableCache->xidStopLimit = xidStopLimit; - ShmemVariableCache->xidWrapLimit = xidWrapLimit; ShmemVariableCache->oldestXidDB = oldest_datoid; curXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); LWLockRelease(XidGenLock); - /* Log the info */ - ereport(DEBUG1, - (errmsg_internal("transaction ID wrap limit is %u, limited by database with OID %u", - xidWrapLimit, oldest_datoid))); - /* * If past the autovacuum force point, immediately signal an autovac * request. The reason for this is that autovac only processes one @@ -438,41 +328,6 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) if (TransactionIdFollowsOrEquals(curXid, xidVacLimit) && IsUnderPostmaster && !InRecovery) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - /* Give an immediate warning if past the wrap warn point */ - if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit) && !InRecovery) - { - char *oldest_datname; - - /* - * We can be called when not inside a transaction, for example during - * StartupXLOG(). In such a case we cannot do database access, so we - * must just report the oldest DB's OID. - * - * Note: it's also possible that get_database_name fails and returns - * NULL, for example because the database just got dropped. We'll - * still warn, even though the warning might now be unnecessary. 
- */ - if (IsTransactionState()) - oldest_datname = get_database_name(oldest_datoid); - else - oldest_datname = NULL; - - if (oldest_datname) - ereport(WARNING, - (errmsg("database \"%s\" must be vacuumed within %u transactions", - oldest_datname, - xidWrapLimit - curXid), - errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - else - ereport(WARNING, - (errmsg("database with OID %u must be vacuumed within %u transactions", - oldest_datoid, - xidWrapLimit - curXid), - errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" - "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); - } } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index a5116d10b1..8a741b48f8 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -402,7 +402,6 @@ IsAbortedTransactionBlockState(void) return false; } - /* * GetTopTransactionId * @@ -5673,6 +5672,17 @@ XactLogCommitRecord(TimestampTz commit_time, xl_subxacts.nsubxacts = nsubxacts; } + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid_lo = (uint32) (twophase_xid & 0xFFFFFFFF); + xl_twophase.xid_hi = (uint32) (twophase_xid >> 32); + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + if (nrels > 0) { xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILELOCATORS; @@ -5692,16 +5702,6 @@ XactLogCommitRecord(TimestampTz commit_time, xl_invals.nmsgs = nmsgs; } - if (TransactionIdIsValid(twophase_xid)) - { - xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; - xl_twophase.xid = twophase_xid; - Assert(twophase_gid != NULL); - - if (XLogLogicalInfoActive()) - xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; - } - /* dump transaction origin information */ if 
(replorigin_session_origin != InvalidRepOriginId) { @@ -5822,6 +5822,17 @@ XactLogAbortRecord(TimestampTz abort_time, xl_subxacts.nsubxacts = nsubxacts; } + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid_lo = (uint32) (twophase_xid & 0xFFFFFFFF); + xl_twophase.xid_hi = (uint32) (twophase_xid >> 32); + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + if (nrels > 0) { xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILELOCATORS; @@ -5838,7 +5849,8 @@ XactLogAbortRecord(TimestampTz abort_time, if (TransactionIdIsValid(twophase_xid)) { xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; - xl_twophase.xid = twophase_xid; + xl_twophase.xid_lo = (uint32) (twophase_xid & 0xFFFFFFFF); + xl_twophase.xid_hi = (uint32) (twophase_xid >> 32); Assert(twophase_gid != NULL); if (XLogLogicalInfoActive()) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3c74f1502a..b3ac2393eb 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4714,8 +4714,8 @@ BootStrapXLOG(void) checkPoint.PrevTimeLineID = BootstrapTimeLineID; checkPoint.fullPageWrites = fullPageWrites; checkPoint.nextXid = - FullTransactionIdFromEpochAndXid(0, Max(FirstNormalTransactionId, - start_xid)); + FullTransactionIdFromXid(Max(FirstNormalTransactionId, + start_xid)); checkPoint.nextOid = FirstGenbkiObjectId; checkPoint.nextMulti = Max(FirstMultiXactId, start_mxid); checkPoint.nextMultiOffset = start_mxoff; @@ -6814,7 +6814,7 @@ CreateCheckPoint(int flags) UpdateControlFile(); LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ + /* Update shared-memory copy of checkpoint XID/base */ SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); @@ -7840,7 +7840,7 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; 
LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ + /* Update shared-memory copy of checkpoint XID/base */ SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); @@ -7901,7 +7901,7 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ + /* Update shared-memory copy of checkpoint XID/base */ SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5ca15ebbf2..fbeec030f9 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -260,6 +260,11 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) BufferGetTag(buffer, ®buf->rlocator, ®buf->forkno, ®buf->block); regbuf->page = BufferGetPage(buffer); regbuf->flags = flags; + if (IsBufferConverted(buffer)) + { + regbuf->flags |= REGBUF_CONVERTED; + MarkBufferConverted(buffer, false); + } regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; @@ -583,6 +588,8 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, needs_backup = true; else if (regbuf->flags & REGBUF_NO_IMAGE) needs_backup = false; + else if (regbuf->flags & REGBUF_CONVERTED) + needs_backup = true; else if (!doPageWrites) needs_backup = false; else diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 5a8fe81f82..f5f9f2cb18 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -2144,37 +2144,3 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) return true; } - -#ifndef FRONTEND - -/* - * Extract the FullTransactionId from a WAL record. 
- */ -FullTransactionId -XLogRecGetFullXid(XLogReaderState *record) -{ - TransactionId xid, - next_xid; - uint32 epoch; - - /* - * This function is only safe during replay, because it depends on the - * replay state. See AdvanceNextFullTransactionIdPastXid() for more. - */ - Assert(AmStartupProcess() || !IsUnderPostmaster); - - xid = XLogRecGetXid(record); - next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); - epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); - - /* - * If xid is numerically greater than next_xid, it has to be from the last - * epoch. - */ - if (unlikely(xid > next_xid)) - --epoch; - - return FullTransactionIdFromEpochAndXid(epoch, xid); -} - -#endif diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index e83c7dded9..5a72d24f6f 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -816,7 +816,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, wasShutdown ? 
"true" : "false"))); ereport(DEBUG1, (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", - U64FromFullTransactionId(checkPoint.nextXid), + XidFromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, (errmsg_internal("next MultiXactId: %llu; next MultiXactOffset: %llu", diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index a6edfcda32..af486d324b 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -120,7 +120,7 @@ static const struct typinfo TypInfo[] = { F_OIDIN, F_OIDOUT}, {"tid", TIDOID, 0, 6, false, TYPALIGN_SHORT, TYPSTORAGE_PLAIN, InvalidOid, F_TIDIN, F_TIDOUT}, - {"xid", XIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, + {"xid", XIDOID, 0, 8, FLOAT8PASSBYVAL, TYPALIGN_XID, TYPSTORAGE_PLAIN, InvalidOid, F_XIDIN, F_XIDOUT}, {"cid", CIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_CIDIN, F_CIDOUT}, @@ -252,15 +252,13 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) break; case 'm': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_mxid = value; + start_mxid = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_mxid) /* overflow */ + !StartMultiXactIdIsValid(start_mxid)) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -270,15 +268,13 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) break; case 'o': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_mxoff = value; + start_mxoff = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_mxoff) /* overflow */ + !StartMultiXactOffsetIsValid(start_mxoff)) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -303,15 +299,13 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) break; case 'x': { - unsigned long 
value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_xid = value; + start_xid = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_xid) /* overflow */ + !StartTransactionIdIsValid(start_xid)) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 5b49cc5a09..694c2be546 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -159,8 +159,8 @@ static const FormData_pg_attribute a2 = { .attnum = MinTransactionIdAttributeNumber, .attcacheoff = -1, .atttypmod = -1, - .attbyval = true, - .attalign = TYPALIGN_INT, + .attbyval = FLOAT8PASSBYVAL, + .attalign = TYPALIGN_XID, .attstorage = TYPSTORAGE_PLAIN, .attnotnull = true, .attislocal = true, @@ -187,8 +187,8 @@ static const FormData_pg_attribute a4 = { .attnum = MaxTransactionIdAttributeNumber, .attcacheoff = -1, .atttypmod = -1, - .attbyval = true, - .attalign = TYPALIGN_INT, + .attbyval = FLOAT8PASSBYVAL, + .attalign = TYPALIGN_XID, .attstorage = TYPSTORAGE_PLAIN, .attnotnull = true, .attislocal = true, diff --git a/src/backend/catalog/pg_inherits.c b/src/backend/catalog/pg_inherits.c index 92afbc2f25..3a1eda413f 100644 --- a/src/backend/catalog/pg_inherits.c +++ b/src/backend/catalog/pg_inherits.c @@ -146,7 +146,7 @@ find_inheritance_children_extended(Oid parentrelId, bool omit_detached, TransactionId xmin; Snapshot snap; - xmin = HeapTupleHeaderGetXmin(inheritsTuple->t_data); + xmin = HeapTupleGetXmin(inheritsTuple); snap = GetActiveSnapshot(); if (!XidInMVCCSnapshot(xmin, snap)) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 056dca8e47..56295b9aa6 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -187,7 +187,7 @@ typedef struct AsyncQueueEntry } AsyncQueueEntry; /* Currently, no field of AsyncQueueEntry requires more than int alignment */ -#define QUEUEALIGN(len) 
INTALIGN(len) +#define QUEUEALIGN(len) TYPEALIGN(8, len) #define AsyncQueueEntryEmptySize (offsetof(AsyncQueueEntry, data) + 2) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 96b46cbc02..44fd9efb23 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -130,7 +130,8 @@ static void CreateDatabaseUsingWalLog(Oid src_dboid, Oid dst_dboid, Oid src_tsid static List *ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath); static List *ScanSourceDatabasePgClassPage(Page page, Buffer buf, Oid tbid, Oid dbid, char *srcpath, - List *rlocatorlist, Snapshot snapshot); + List *rlocatorlist, Snapshot snapshot, + bool is_toast); static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple, Oid tbid, Oid dbid, char *srcpath); @@ -308,9 +309,10 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) } /* Append relevant pg_class tuples for current page to rlocatorlist. */ + /* No toast is expected in sys tables */ rlocatorlist = ScanSourceDatabasePgClassPage(page, buf, tbid, dbid, srcpath, rlocatorlist, - snapshot); + snapshot, false); UnlockReleaseBuffer(buf); } @@ -328,7 +330,7 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) static List * ScanSourceDatabasePgClassPage(Page page, Buffer buf, Oid tbid, Oid dbid, char *srcpath, List *rlocatorlist, - Snapshot snapshot) + Snapshot snapshot, bool is_toast) { BlockNumber blkno = BufferGetBlockNumber(buf); OffsetNumber offnum; @@ -358,6 +360,7 @@ ScanSourceDatabasePgClassPage(Page page, Buffer buf, Oid tbid, Oid dbid, tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationRelationId; + HeapTupleCopyBaseFromPage(buf, &tuple, page, is_toast); /* Skip tuples that are not visible to this snapshot. 
*/ if (HeapTupleSatisfiesVisibility(&tuple, snapshot, buf)) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index fd56066c13..f3b19cf188 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1656,7 +1656,7 @@ DefineIndex(Oid relationId, set_indexsafe_procflags(); /* We should now definitely not be advertising any xmin. */ - Assert(MyProc->xmin == InvalidTransactionId); + Assert(pg_atomic_read_u64(&MyProc->xmin) == InvalidTransactionId); /* * The index is now valid in the sense that it contains all currently @@ -4339,8 +4339,8 @@ set_indexsafe_procflags(void) * This should only be called before installing xid or xmin in MyProc; * otherwise, concurrent processes could see an Xmin that moves backwards. */ - Assert(MyProc->xid == InvalidTransactionId && - MyProc->xmin == InvalidTransactionId); + Assert(pg_atomic_read_u64(&MyProc->xid) == InvalidTransactionId && + pg_atomic_read_u64(&MyProc->xmin) == InvalidTransactionId); LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); MyProc->statusFlags |= PROC_IN_SAFE_IC; diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 99c9f91cba..c72cab394f 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -48,6 +48,23 @@ #include "utils/syscache.h" #include "utils/varlena.h" +static inline void +SeqTupleHeaderSetXmin(HeapTupleHeader htup, TransactionId xid) +{ + htup->t_choice.t_heap.t_xmin = xid; +} + +static inline void +SeqTupleHeaderSetXmax(HeapTupleHeader htup, TransactionId xid) +{ + htup->t_choice.t_heap.t_xmax = xid; +} + +static inline TransactionId +SeqTupleHeaderGetRawXmax(HeapTupleHeader htup) +{ + return htup->t_choice.t_heap.t_xmax; +} /* * We don't want to log each fetching of a value from a sequence, @@ -397,10 +414,10 @@ fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum) * because if the current transaction aborts, no other xact will ever * examine the sequence tuple 
anyway. */ - HeapTupleHeaderSetXmin(tuple->t_data, FrozenTransactionId); + SeqTupleHeaderSetXmin(tuple->t_data, FrozenTransactionId); HeapTupleHeaderSetXminFrozen(tuple->t_data); HeapTupleHeaderSetCmin(tuple->t_data, FirstCommandId); - HeapTupleHeaderSetXmax(tuple->t_data, InvalidTransactionId); + SeqTupleHeaderSetXmax(tuple->t_data, InvalidTransactionId); tuple->t_data->t_infomask |= HEAP_XMAX_INVALID; ItemPointerSet(&tuple->t_data->t_ctid, 0, FirstOffsetNumber); @@ -1232,9 +1249,9 @@ read_seq_tuple(Relation rel, Buffer *buf, HeapTuple seqdatatuple) * this again if the update gets lost. */ Assert(!(seqdatatuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); - if (HeapTupleHeaderGetRawXmax(seqdatatuple->t_data) != InvalidTransactionId) + if (SeqTupleHeaderGetRawXmax(seqdatatuple->t_data) != InvalidTransactionId) { - HeapTupleHeaderSetXmax(seqdatatuple->t_data, InvalidTransactionId); + SeqTupleHeaderSetXmax(seqdatatuple->t_data, InvalidTransactionId); seqdatatuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED; seqdatatuple->t_data->t_infomask |= HEAP_XMAX_INVALID; MarkBufferDirtyHint(*buf, true); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 7ccde07de9..c3d7ae61a9 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -63,13 +63,12 @@ /* * GUC parameters */ -int vacuum_freeze_min_age; -int vacuum_freeze_table_age; -int vacuum_multixact_freeze_min_age; -int vacuum_multixact_freeze_table_age; -int vacuum_failsafe_age; -int vacuum_multixact_failsafe_age; - +int64 vacuum_freeze_min_age; +int64 vacuum_freeze_table_age; +int64 vacuum_multixact_freeze_min_age; +int64 vacuum_multixact_freeze_table_age; +int64 vacuum_failsafe_age; +int64 vacuum_multixact_failsafe_age; /* A few variables that don't seem worth passing around as parameters */ static MemoryContext vac_context = NULL; @@ -955,10 +954,10 @@ get_all_vacuum_rels(int options) */ bool vacuum_set_xid_limits(Relation rel, - int freeze_min_age, - int 
multixact_freeze_min_age, - int freeze_table_age, - int multixact_freeze_table_age, + int64 freeze_min_age, + int64 multixact_freeze_min_age, + int64 freeze_table_age, + int64 multixact_freeze_table_age, TransactionId *oldestXmin, MultiXactId *oldestMxact, TransactionId *freezeLimit, @@ -970,7 +969,7 @@ vacuum_set_xid_limits(Relation rel, MultiXactId nextMXID, safeOldestMxact, aggressiveMXIDCutoff; - int effective_multixact_freeze_max_age; + int64 effective_multixact_freeze_max_age; /* * Acquire oldestXmin. @@ -1038,7 +1037,7 @@ vacuum_set_xid_limits(Relation rel, * normally autovacuum_multixact_freeze_max_age, but may be less if we are * short of multixact member space. */ - effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age; /* * Determine the minimum multixact freeze age to use: as specified by @@ -1065,11 +1064,13 @@ vacuum_set_xid_limits(Relation rel, * held back to an unsafe degree in passing */ safeOldestXmin = nextXID - autovacuum_freeze_max_age; - if (!TransactionIdIsNormal(safeOldestXmin)) + if (nextXID > FirstNormalTransactionId + autovacuum_freeze_max_age) safeOldestXmin = FirstNormalTransactionId; + safeOldestMxact = nextMXID - effective_multixact_freeze_max_age; if (safeOldestMxact < FirstMultiXactId) safeOldestMxact = FirstMultiXactId; + if (TransactionIdPrecedes(*oldestXmin, safeOldestXmin)) ereport(WARNING, (errmsg("cutoff for removing and freezing tuples is far in the past"), @@ -1378,6 +1379,9 @@ vac_update_relstats(Relation relation, futurexid = false; if (frozenxid_updated) *frozenxid_updated = false; + + Assert(TransactionIdPrecedesOrEquals(frozenxid, ReadNextTransactionId())); + if (TransactionIdIsNormal(frozenxid) && oldfrozenxid != frozenxid) { bool update = false; @@ -1401,6 +1405,9 @@ vac_update_relstats(Relation relation, futuremxid = false; if (minmulti_updated) *minmulti_updated = false; + + Assert(MultiXactIdPrecedesOrEquals(minmulti, 
ReadNextMultiXactId())); + if (MultiXactIdIsValid(minmulti) && oldminmulti != minmulti) { bool update = false; @@ -1428,14 +1435,16 @@ vac_update_relstats(Relation relation, if (futurexid) ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("overwrote invalid relfrozenxid value %u with new value %u for table \"%s\"", - oldfrozenxid, frozenxid, + errmsg_internal("overwrote invalid relfrozenxid value %llu with new value %llu for table \"%s\"", + (unsigned long long) oldfrozenxid, + (unsigned long long) frozenxid, RelationGetRelationName(relation)))); if (futuremxid) ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("overwrote invalid relminmxid value %u with new value %u for table \"%s\"", - oldminmulti, minmulti, + errmsg_internal("overwrote invalid relminmxid value %llu with new value %llu for table \"%s\"", + (unsigned long long) oldminmulti, + (unsigned long long) minmulti, RelationGetRelationName(relation)))); } diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 9b9bbf00a9..2f37cfed60 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -3180,6 +3180,7 @@ ExecEvalFieldStoreDeForm(ExprState *state, ExprEvalStep *op, ExprContext *econte tmptup.t_len = HeapTupleHeaderGetDatumLength(tuphdr); ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tmptup); tmptup.t_data = tuphdr; heap_deform_tuple(&tmptup, tupDesc, diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 9df1f81ea8..d8c92f4846 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1070,6 +1070,7 @@ GetAttributeByName(HeapTupleHeader tuple, const char *attname, bool *isNull) tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tmptup); tmptup.t_data = tuple; result = 
heap_getattr(&tmptup, diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 04454ad6e6..bbb060fdb6 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3678,6 +3678,7 @@ ExecModifyTable(PlanState *pstate) HeapTupleHeaderGetDatumLength(oldtupdata.t_data); ItemPointerSetInvalid(&(oldtupdata.t_self)); /* Historically, view triggers see invalid t_tableOid. */ + HeapTupleCopyHeaderXids(&oldtupdata); oldtupdata.t_tableOid = (relkind == RELKIND_VIEW) ? InvalidOid : RelationGetRelid(resultRelInfo->ri_RelationDesc); diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index fd5796f1b9..26b60cc77e 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -1154,6 +1154,7 @@ SPI_modifytuple(Relation rel, HeapTuple tuple, int natts, int *attnum, mtuple->t_data->t_ctid = tuple->t_data->t_ctid; mtuple->t_self = tuple->t_self; mtuple->t_tableOid = tuple->t_tableOid; + HeapTupleCopyBase(mtuple, tuple); } else { diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl index 81b8c184a9..39cf494372 100644 --- a/src/backend/nodes/gen_node_support.pl +++ b/src/backend/nodes/gen_node_support.pl @@ -955,13 +955,13 @@ _read${n}(void) || $t eq 'bits32' || $t eq 'AclMode' || $t eq 'BlockNumber' - || $t eq 'Index' - || $t eq 'SubTransactionId') + || $t eq 'Index') { print $off "\tWRITE_UINT_FIELD($f);\n"; print $rff "\tREAD_UINT_FIELD($f);\n" unless $no_read; } - elsif ($t eq 'uint64') + elsif ($t eq 'uint64' + || $t eq 'SubTransactionId') { print $off "\tWRITE_UINT64_FIELD($f);\n"; print $rff "\tREAD_UINT64_FIELD($f);\n" unless $no_read; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 27e5cdee6f..59f2be3be9 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -817,7 +817,6 @@ _outConstraint(StringInfo str, const Constraint *node) } } - /* * outNode - * converts a Node into 
ascii string and append it to 'str' diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 6d5718ee4c..54697ed00d 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -229,7 +229,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && - !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), + !TransactionIdPrecedes(HeapTupleGetXmin(indexRelation->rd_indextuple), TransactionXmin)) { root->glob->transientPlan = true; diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 1e90b72b74..1cbdadf792 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -123,8 +123,8 @@ int autovacuum_vac_ins_thresh; double autovacuum_vac_ins_scale; int autovacuum_anl_thresh; double autovacuum_anl_scale; -int autovacuum_freeze_max_age; -int autovacuum_multixact_freeze_max_age; +int64 autovacuum_freeze_max_age; +int64 autovacuum_multixact_freeze_max_age; double autovacuum_vac_cost_delay; int autovacuum_vac_cost_limit; @@ -147,10 +147,10 @@ static TransactionId recentXid; static MultiXactId recentMulti; /* Default freeze ages to use for autovacuum (varies by database) */ -static int default_freeze_min_age; -static int default_freeze_table_age; -static int default_multixact_freeze_min_age; -static int default_multixact_freeze_table_age; +static int64 default_freeze_min_age; +static int64 default_freeze_table_age; +static int64 default_multixact_freeze_min_age; +static int64 default_multixact_freeze_table_age; /* Memory context for long-lived data */ static MemoryContext AutovacMemCxt; @@ -326,15 +326,15 @@ static void FreeWorkerInfo(int code, Datum arg); static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map, TupleDesc pg_class_desc, - int effective_multixact_freeze_max_age); + 
int64 effective_multixact_freeze_max_age); static void recheck_relation_needs_vacanalyze(Oid relid, AutoVacOpts *avopts, Form_pg_class classForm, - int effective_multixact_freeze_max_age, + int64 effective_multixact_freeze_max_age, bool *dovacuum, bool *doanalyze, bool *wraparound); static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts, Form_pg_class classForm, PgStat_StatTabEntry *tabentry, - int effective_multixact_freeze_max_age, + int64 effective_multixact_freeze_max_age, bool *dovacuum, bool *doanalyze, bool *wraparound); static void autovacuum_do_vac_analyze(autovac_table *tab, @@ -1150,6 +1150,7 @@ do_start_worker(void) ListCell *cell; TransactionId xidForceLimit; MultiXactId multiForceLimit; + int64 multiMembersThreshold; bool for_xid_wrap; bool for_multi_wrap; avw_dbase *avdb; @@ -1186,17 +1187,18 @@ do_start_worker(void) * particular tables, but not loosened.) */ recentXid = ReadNextTransactionId(); - xidForceLimit = recentXid - autovacuum_freeze_max_age; - /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */ - /* this can cause the limit to go backwards by 3, but that's OK */ - if (xidForceLimit < FirstNormalTransactionId) - xidForceLimit -= FirstNormalTransactionId; + if (recentXid > FirstNormalTransactionId + autovacuum_freeze_max_age) + xidForceLimit = recentXid - autovacuum_freeze_max_age; + else + xidForceLimit = FirstNormalTransactionId; /* Also determine the oldest datminmxid we will consider. */ recentMulti = ReadNextMultiXactId(); - multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold(); - if (multiForceLimit < FirstMultiXactId) - multiForceLimit -= FirstMultiXactId; + multiMembersThreshold = autovacuum_multixact_freeze_max_age; + if (recentMulti > FirstMultiXactId + multiMembersThreshold) + multiForceLimit = recentMulti - multiMembersThreshold; + else + multiForceLimit = FirstMultiXactId; /* * Choose a database to connect to. 
We pick the database that was least @@ -1969,7 +1971,7 @@ do_autovacuum(void) BufferAccessStrategy bstrategy; ScanKeyData key; TupleDesc pg_class_desc; - int effective_multixact_freeze_max_age; + int64 effective_multixact_freeze_max_age; bool did_vacuum = false; bool found_concurrent_worker = false; int i; @@ -1992,7 +1994,7 @@ do_autovacuum(void) * normally autovacuum_multixact_freeze_max_age, but may be less if we are * short of multixact member space. */ - effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age; /* * Find the pg_database entry and select the default freeze ages. We use @@ -2758,7 +2760,7 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) static autovac_table * table_recheck_autovac(Oid relid, HTAB *table_toast_map, TupleDesc pg_class_desc, - int effective_multixact_freeze_max_age) + int64 effective_multixact_freeze_max_age) { Form_pg_class classForm; HeapTuple classTup; @@ -2797,10 +2799,10 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, /* OK, it needs something done */ if (doanalyze || dovacuum) { - int freeze_min_age; - int freeze_table_age; - int multixact_freeze_min_age; - int multixact_freeze_table_age; + int64 freeze_min_age; + int64 freeze_table_age; + int64 multixact_freeze_min_age; + int64 multixact_freeze_table_age; int vac_cost_limit; double vac_cost_delay; int log_min_duration; @@ -2905,7 +2907,7 @@ static void recheck_relation_needs_vacanalyze(Oid relid, AutoVacOpts *avopts, Form_pg_class classForm, - int effective_multixact_freeze_max_age, + int64 effective_multixact_freeze_max_age, bool *dovacuum, bool *doanalyze, bool *wraparound) @@ -2967,7 +2969,7 @@ relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts, Form_pg_class classForm, PgStat_StatTabEntry *tabentry, - int effective_multixact_freeze_max_age, + int64 effective_multixact_freeze_max_age, /* output params below */ bool *dovacuum, bool *doanalyze, @@ 
-2996,8 +2998,8 @@ relation_needs_vacanalyze(Oid relid, anltuples; /* freeze parameters */ - int freeze_max_age; - int multixact_freeze_max_age; + int64 freeze_max_age; + int64 multixact_freeze_max_age; TransactionId xidForceLimit; MultiXactId multiForceLimit; @@ -3047,17 +3049,19 @@ relation_needs_vacanalyze(Oid relid, av_enabled = (relopts ? relopts->enabled : true); /* Force vacuum if table is at risk of wraparound */ - xidForceLimit = recentXid - freeze_max_age; - if (xidForceLimit < FirstNormalTransactionId) - xidForceLimit -= FirstNormalTransactionId; + if (recentXid > FirstNormalTransactionId + freeze_max_age) + xidForceLimit = recentXid - freeze_max_age; + else + xidForceLimit = FirstNormalTransactionId; force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) && TransactionIdPrecedes(classForm->relfrozenxid, xidForceLimit)); if (!force_vacuum) { - multiForceLimit = recentMulti - multixact_freeze_max_age; - if (multiForceLimit < FirstMultiXactId) - multiForceLimit -= FirstMultiXactId; + if (recentMulti > FirstMultiXactId + multixact_freeze_max_age) + multiForceLimit = recentMulti - multixact_freeze_max_age; + else + multiForceLimit = FirstMultiXactId; force_vacuum = MultiXactIdIsValid(classForm->relminmxid) && MultiXactIdPrecedes(classForm->relminmxid, multiForceLimit); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 2cc0ac9eb0..dd41f54049 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -847,8 +847,12 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) xl_heap_insert *xlrec; ReorderBufferChange *change; RelFileLocator target_locator; + bool isinit = (XLogRecGetInfo(r) & XLOG_HEAP_INIT_PAGE) != 0; + Pointer rec_data = (Pointer) XLogRecGetData(r); - xlrec = (xl_heap_insert *) XLogRecGetData(r); + if (isinit) + rec_data += sizeof(TransactionId); + xlrec = (xl_heap_insert *) rec_data; /* * Ignore insert records without new 
tuples (this does happen when @@ -904,8 +908,12 @@ DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) ReorderBufferChange *change; char *data; RelFileLocator target_locator; + bool isinit = (XLogRecGetInfo(r) & XLOG_HEAP_INIT_PAGE) != 0; + Pointer rec_data = (Pointer) XLogRecGetData(r); - xlrec = (xl_heap_update *) XLogRecGetData(r); + if (isinit) + rec_data += sizeof(TransactionId); + xlrec = (xl_heap_update *) rec_data; /* only interested in our database */ XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); @@ -1065,8 +1073,12 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) char *tupledata; Size tuplelen; RelFileLocator rlocator; + bool isinit = (XLogRecGetInfo(r) & XLOG_HEAP_INIT_PAGE) != 0; + Pointer rec_data = (Pointer) XLogRecGetData(r); - xlrec = (xl_heap_multi_insert *) XLogRecGetData(r); + if (isinit) + rec_data += sizeof(TransactionId); + xlrec = (xl_heap_multi_insert *) rec_data; /* * Ignore insert records without new tuples. This happens when a diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index ff8513e2d2..c1c4adad9d 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -64,7 +64,7 @@ logicalrep_write_begin(StringInfo out, ReorderBufferTXN *txn) /* fixed fields */ pq_sendint64(out, txn->final_lsn); pq_sendint64(out, txn->xact_time.commit_time); - pq_sendint32(out, txn->xid); + pq_sendint64(out, txn->xid); } /* @@ -78,7 +78,7 @@ logicalrep_read_begin(StringInfo in, LogicalRepBeginData *begin_data) if (begin_data->final_lsn == InvalidXLogRecPtr) elog(ERROR, "final_lsn not set in begin message"); begin_data->committime = pq_getmsgint64(in); - begin_data->xid = pq_getmsgint(in, 4); + begin_data->xid = pq_getmsgint64(in); } @@ -132,7 +132,7 @@ logicalrep_write_begin_prepare(StringInfo out, ReorderBufferTXN *txn) pq_sendint64(out, txn->final_lsn); pq_sendint64(out, txn->end_lsn); pq_sendint64(out, 
txn->xact_time.prepare_time); - pq_sendint32(out, txn->xid); + pq_sendint64(out, txn->xid); /* send gid */ pq_sendstring(out, txn->gid); @@ -152,7 +152,7 @@ logicalrep_read_begin_prepare(StringInfo in, LogicalRepPreparedTxnData *begin_da if (begin_data->end_lsn == InvalidXLogRecPtr) elog(ERROR, "end_lsn not set in begin prepare message"); begin_data->prepare_time = pq_getmsgint64(in); - begin_data->xid = pq_getmsgint(in, 4); + begin_data->xid = pq_getmsgint64(in); /* read gid (copy it into a pre-allocated buffer) */ strlcpy(begin_data->gid, pq_getmsgstring(in), sizeof(begin_data->gid)); @@ -185,7 +185,7 @@ logicalrep_write_prepare_common(StringInfo out, LogicalRepMsgType type, pq_sendint64(out, prepare_lsn); pq_sendint64(out, txn->end_lsn); pq_sendint64(out, txn->xact_time.prepare_time); - pq_sendint32(out, txn->xid); + pq_sendint64(out, txn->xid); /* send gid */ pq_sendstring(out, txn->gid); @@ -224,7 +224,7 @@ logicalrep_read_prepare_common(StringInfo in, char *msgtype, if (prepare_data->end_lsn == InvalidXLogRecPtr) elog(ERROR, "end_lsn is not set in %s message", msgtype); prepare_data->prepare_time = pq_getmsgint64(in); - prepare_data->xid = pq_getmsgint(in, 4); + prepare_data->xid = pq_getmsgint64(in); if (prepare_data->xid == InvalidTransactionId) elog(ERROR, "invalid two-phase transaction ID in %s message", msgtype); @@ -265,7 +265,7 @@ logicalrep_write_commit_prepared(StringInfo out, ReorderBufferTXN *txn, pq_sendint64(out, commit_lsn); pq_sendint64(out, txn->end_lsn); pq_sendint64(out, txn->xact_time.commit_time); - pq_sendint32(out, txn->xid); + pq_sendint64(out, txn->xid); /* send gid */ pq_sendstring(out, txn->gid); @@ -291,7 +291,7 @@ logicalrep_read_commit_prepared(StringInfo in, LogicalRepCommitPreparedTxnData * if (prepare_data->end_lsn == InvalidXLogRecPtr) elog(ERROR, "end_lsn is not set in commit prepared message"); prepare_data->commit_time = pq_getmsgint64(in); - prepare_data->xid = pq_getmsgint(in, 4); + prepare_data->xid = pq_getmsgint64(in); 
/* read gid (copy it into a pre-allocated buffer) */ strlcpy(prepare_data->gid, pq_getmsgstring(in), sizeof(prepare_data->gid)); @@ -323,7 +323,7 @@ logicalrep_write_rollback_prepared(StringInfo out, ReorderBufferTXN *txn, pq_sendint64(out, txn->end_lsn); pq_sendint64(out, prepare_time); pq_sendint64(out, txn->xact_time.commit_time); - pq_sendint32(out, txn->xid); + pq_sendint64(out, txn->xid); /* send gid */ pq_sendstring(out, txn->gid); @@ -351,7 +351,7 @@ logicalrep_read_rollback_prepared(StringInfo in, elog(ERROR, "rollback_end_lsn is not set in rollback prepared message"); rollback_data->prepare_time = pq_getmsgint64(in); rollback_data->rollback_time = pq_getmsgint64(in); - rollback_data->xid = pq_getmsgint(in, 4); + rollback_data->xid = pq_getmsgint64(in); /* read gid (copy it into a pre-allocated buffer) */ strlcpy(rollback_data->gid, pq_getmsgstring(in), sizeof(rollback_data->gid)); @@ -418,7 +418,7 @@ logicalrep_write_insert(StringInfo out, TransactionId xid, Relation rel, /* transaction ID (if not valid, we're not streaming) */ if (TransactionIdIsValid(xid)) - pq_sendint32(out, xid); + pq_sendint64(out, xid); /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -467,7 +467,7 @@ logicalrep_write_update(StringInfo out, TransactionId xid, Relation rel, /* transaction ID (if not valid, we're not streaming) */ if (TransactionIdIsValid(xid)) - pq_sendint32(out, xid); + pq_sendint64(out, xid); /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -541,7 +541,7 @@ logicalrep_write_delete(StringInfo out, TransactionId xid, Relation rel, /* transaction ID (if not valid, we're not streaming) */ if (TransactionIdIsValid(xid)) - pq_sendint32(out, xid); + pq_sendint64(out, xid); /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -595,7 +595,7 @@ logicalrep_write_truncate(StringInfo out, /* transaction ID (if not valid, we're not streaming) */ if (TransactionIdIsValid(xid)) - 
pq_sendint32(out, xid); + pq_sendint64(out, xid); pq_sendint32(out, nrelids); @@ -653,7 +653,7 @@ logicalrep_write_message(StringInfo out, TransactionId xid, XLogRecPtr lsn, /* transaction ID (if not valid, we're not streaming) */ if (TransactionIdIsValid(xid)) - pq_sendint32(out, xid); + pq_sendint64(out, xid); pq_sendint8(out, flags); pq_sendint64(out, lsn); @@ -675,7 +675,7 @@ logicalrep_write_rel(StringInfo out, TransactionId xid, Relation rel, /* transaction ID (if not valid, we're not streaming) */ if (TransactionIdIsValid(xid)) - pq_sendint32(out, xid); + pq_sendint64(out, xid); /* use Oid as relation identifier */ pq_sendint32(out, RelationGetRelid(rel)); @@ -731,7 +731,7 @@ logicalrep_write_typ(StringInfo out, TransactionId xid, Oid typoid) /* transaction ID (if not valid, we're not streaming) */ if (TransactionIdIsValid(xid)) - pq_sendint32(out, xid); + pq_sendint64(out, xid); tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(basetypoid)); if (!HeapTupleIsValid(tup)) @@ -1079,7 +1079,7 @@ logicalrep_write_stream_start(StringInfo out, Assert(TransactionIdIsValid(xid)); /* transaction ID (we're starting to stream, so must be valid) */ - pq_sendint32(out, xid); + pq_sendint64(out, xid); /* 1 if this is the first streaming segment for this xid */ pq_sendbyte(out, first_segment ? 
1 : 0); @@ -1095,7 +1095,7 @@ logicalrep_read_stream_start(StringInfo in, bool *first_segment) Assert(first_segment); - xid = pq_getmsgint(in, 4); + xid = pq_getmsgint64(in); *first_segment = (pq_getmsgbyte(in) == 1); return xid; @@ -1124,7 +1124,7 @@ logicalrep_write_stream_commit(StringInfo out, ReorderBufferTXN *txn, Assert(TransactionIdIsValid(txn->xid)); /* transaction ID */ - pq_sendint32(out, txn->xid); + pq_sendint64(out, txn->xid); /* send the flags field (unused for now) */ pq_sendbyte(out, flags); @@ -1144,7 +1144,7 @@ logicalrep_read_stream_commit(StringInfo in, LogicalRepCommitData *commit_data) TransactionId xid; uint8 flags; - xid = pq_getmsgint(in, 4); + xid = pq_getmsgint64(in); /* read flags (unused for now) */ flags = pq_getmsgbyte(in); @@ -1173,8 +1173,8 @@ logicalrep_write_stream_abort(StringInfo out, TransactionId xid, Assert(TransactionIdIsValid(xid) && TransactionIdIsValid(subxid)); /* transaction ID */ - pq_sendint32(out, xid); - pq_sendint32(out, subxid); + pq_sendint64(out, xid); + pq_sendint64(out, subxid); } /* @@ -1186,8 +1186,8 @@ logicalrep_read_stream_abort(StringInfo in, TransactionId *xid, { Assert(xid && subxid); - *xid = pq_getmsgint(in, 4); - *subxid = pq_getmsgint(in, 4); + *xid = pq_getmsgint64(in); + *subxid = pq_getmsgint64(in); } /* diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 9b20e158eb..e0259aaa33 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -5094,8 +5094,12 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) TransactionId f_mapped_xid; TransactionId f_create_xid; XLogRecPtr f_lsn; - uint32 f_hi, - f_lo; + uint32 f_lsn_hi, + f_lsn_lo, + f_mapped_xid_hi, + f_mapped_xid_lo, + f_create_xid_hi, + f_create_xid_lo; RewriteMappingFile *f; if (strcmp(mapping_de->d_name, ".") == 0 || @@ -5107,11 +5111,14 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, 
Snapshot snapshot) continue; if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, - &f_dboid, &f_relid, &f_hi, &f_lo, - &f_mapped_xid, &f_create_xid) != 6) + &f_dboid, &f_relid, &f_lsn_hi, &f_lsn_lo, + &f_mapped_xid_hi, &f_mapped_xid_lo, + &f_create_xid_hi, &f_create_xid_lo) != 8) elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); - f_lsn = ((uint64) f_hi) << 32 | f_lo; + f_lsn = ((uint64) f_lsn_hi) << 32 | f_lsn_lo; + f_mapped_xid = ((uint64) f_mapped_xid_hi) << 32 | f_mapped_xid_lo; + f_create_xid = ((uint64) f_create_xid_hi) << 32 | f_create_xid_lo; /* mapping for another database */ if (f_dboid != dboid) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index d518746ddd..f0adcb4dd1 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -579,7 +579,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); /* so we don't overwrite the existing value */ - if (TransactionIdIsValid(MyProc->xmin)) + if (TransactionIdIsValid(pg_atomic_read_u64(&MyProc->xmin))) elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid"); snap = SnapBuildBuildSnapshot(builder); @@ -601,7 +601,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) } #endif - MyProc->xmin = snap->xmin; + pg_atomic_write_u64(&MyProc->xmin, snap->xmin); /* allocate in transaction context */ newxip = (TransactionId *) @@ -999,9 +999,10 @@ SnapBuildPurgeOlderTxn(SnapBuild *builder) builder->catchange.xip = NULL; } - elog(DEBUG3, "purged catalog modifying transactions from %u to %u, xmin: %u, xmax: %u", + elog(DEBUG3, "purged catalog modifying transactions from %u to %u, xmin: %llu, xmax: %llu", (uint32) builder->catchange.xcnt, (uint32) surviving_xids, - builder->xmin, builder->xmax); + (unsigned long long) builder->xmin, + (unsigned long long) builder->xmax); 
builder->catchange.xcnt = surviving_xids; } } diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index e62de43e09..4e01565fca 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -450,7 +450,7 @@ handle_streamed_transaction(LogicalRepMsgType action, StringInfo s) * We should have received XID of the subxact as the first part of the * message, so extract it. */ - xid = pq_getmsgint(s, 4); + xid = pq_getmsgint64(s); if (!TransactionIdIsValid(xid)) ereport(ERROR, diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 2ecaa5b907..e7d2593cef 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -584,7 +584,8 @@ pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, if (!sent_begin_txn) { - elog(DEBUG1, "skipped replication of an empty transaction with XID: %u", txn->xid); + elog(DEBUG1, "skipped replication of an empty transaction with XID: %llu", + (unsigned long long) txn->xid); return; } diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 927fd2dbe5..d53bc09a14 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1143,10 +1143,6 @@ static void XLogWalRcvSendHSFeedback(bool immed) { TimestampTz now; - FullTransactionId nextFullXid; - TransactionId nextXid; - uint32 xmin_epoch, - catalog_xmin_epoch; TransactionId xmin, catalog_xmin; static TimestampTz sendTime = 0; @@ -1203,31 +1199,15 @@ XLogWalRcvSendHSFeedback(bool immed) catalog_xmin = InvalidTransactionId; } - /* - * Get epoch and adjust if nextXid and oldestXmin are different sides of - * the epoch boundary. 
- */ - nextFullXid = ReadNextFullTransactionId(); - nextXid = XidFromFullTransactionId(nextFullXid); - xmin_epoch = EpochFromFullTransactionId(nextFullXid); - catalog_xmin_epoch = xmin_epoch; - if (nextXid < xmin) - xmin_epoch--; - if (nextXid < catalog_xmin) - catalog_xmin_epoch--; - - elog(DEBUG2, "sending hot standby feedback xmin %llu epoch %u catalog_xmin %llu catalog_xmin_epoch %u", - (unsigned long long) xmin, xmin_epoch, - (unsigned long long) catalog_xmin, catalog_xmin_epoch); + elog(DEBUG2, "sending hot standby feedback xmin %llu catalog_xmin %llu", + (unsigned long long) xmin, (unsigned long long) catalog_xmin); /* Construct the message and send it. */ resetStringInfo(&reply_message); pq_sendbyte(&reply_message, 'h'); pq_sendint64(&reply_message, GetCurrentTimestamp()); - pq_sendint32(&reply_message, xmin); - pq_sendint32(&reply_message, xmin_epoch); - pq_sendint32(&reply_message, catalog_xmin); - pq_sendint32(&reply_message, catalog_xmin_epoch); + pq_sendint64(&reply_message, xmin); + pq_sendint64(&reply_message, catalog_xmin); walrcv_send(wrconn, reply_message.data, reply_message.len); if (TransactionIdIsValid(xmin) || TransactionIdIsValid(catalog_xmin)) primary_has_standby_xmin = true; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 66cbec488c..d0a9bbb6c1 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -255,7 +255,6 @@ static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, Tr static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); -static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p); @@ -293,7 +292,7 @@ InitWalSender(void) */ if (MyDatabaseId == InvalidOid) { - Assert(MyProc->xmin 
== InvalidTransactionId); + Assert(pg_atomic_read_u64(&MyProc->xmin) == InvalidTransactionId); LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); MyProc->statusFlags |= PROC_AFFECTS_ALL_HORIZONS; ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; @@ -2166,7 +2165,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin, TransactionId feedbac ReplicationSlot *slot = MyReplicationSlot; SpinLockAcquire(&slot->mutex); - MyProc->xmin = InvalidTransactionId; + pg_atomic_write_u64(&MyProc->xmin, InvalidTransactionId); /* * For physical replication we don't need the interlock provided by xmin @@ -2198,44 +2197,6 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin, TransactionId feedbac } } -/* - * Check that the provided xmin/epoch are sane, that is, not in the future - * and not so far back as to be already wrapped around. - * - * Epoch of nextXid should be same as standby, or if the counter has - * wrapped, then one greater than standby. - * - * This check doesn't care about whether clog exists for these xids - * at all. - */ -static bool -TransactionIdInRecentPast(TransactionId xid, uint32 epoch) -{ - FullTransactionId nextFullXid; - TransactionId nextXid; - uint32 nextEpoch; - - nextFullXid = ReadNextFullTransactionId(); - nextXid = XidFromFullTransactionId(nextFullXid); - nextEpoch = EpochFromFullTransactionId(nextFullXid); - - if (xid <= nextXid) - { - if (epoch != nextEpoch) - return false; - } - else - { - if (epoch + 1 != nextEpoch) - return false; - } - - if (!TransactionIdPrecedesOrEquals(xid, nextXid)) - return false; /* epoch OK, but it's wrapped around */ - - return true; -} - /* * Hot Standby feedback */ @@ -2243,9 +2204,7 @@ static void ProcessStandbyHSFeedbackMessage(void) { TransactionId feedbackXmin; - uint32 feedbackEpoch; TransactionId feedbackCatalogXmin; - uint32 feedbackCatalogEpoch; TimestampTz replyTime; /* @@ -2254,10 +2213,8 @@ ProcessStandbyHSFeedbackMessage(void) * of this message. 
*/ replyTime = pq_getmsgint64(&reply_message); - feedbackXmin = pq_getmsgint(&reply_message, 4); - feedbackEpoch = pq_getmsgint(&reply_message, 4); - feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); - feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + feedbackXmin = pq_getmsgint64(&reply_message); + feedbackCatalogXmin = pq_getmsgint64(&reply_message); if (message_level_is_interesting(DEBUG2)) { @@ -2266,11 +2223,9 @@ ProcessStandbyHSFeedbackMessage(void) /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(replyTime)); - elog(DEBUG2, "hot standby feedback xmin %llu epoch %u, catalog_xmin %llu epoch %u reply_time %s", + elog(DEBUG2, "hot standby feedback xmin %llu, catalog_xmin %llu reply_time %s", (unsigned long long) feedbackXmin, - feedbackEpoch, (unsigned long long) feedbackCatalogXmin, - feedbackCatalogEpoch, replyTimeStr); pfree(replyTimeStr); @@ -2295,24 +2250,12 @@ ProcessStandbyHSFeedbackMessage(void) if (!TransactionIdIsNormal(feedbackXmin) && !TransactionIdIsNormal(feedbackCatalogXmin)) { - MyProc->xmin = InvalidTransactionId; + pg_atomic_write_u64(&MyProc->xmin, InvalidTransactionId); if (MyReplicationSlot != NULL) PhysicalReplicationSlotNewXmin(feedbackXmin, feedbackCatalogXmin); return; } - /* - * Check that the provided xmin/epoch are sane, that is, not in the future - * and not so far back as to be already wrapped around. Ignore if not. 
- */ - if (TransactionIdIsNormal(feedbackXmin) && - !TransactionIdInRecentPast(feedbackXmin, feedbackEpoch)) - return; - - if (TransactionIdIsNormal(feedbackCatalogXmin) && - !TransactionIdInRecentPast(feedbackCatalogXmin, feedbackCatalogEpoch)) - return; - /* * Set the WalSender's xmin equal to the standby's requested xmin, so that * the xmin will be taken into account by GetSnapshotData() / @@ -2350,9 +2293,9 @@ ProcessStandbyHSFeedbackMessage(void) { if (TransactionIdIsNormal(feedbackCatalogXmin) && TransactionIdPrecedes(feedbackCatalogXmin, feedbackXmin)) - MyProc->xmin = feedbackCatalogXmin; + pg_atomic_write_u64(&MyProc->xmin, feedbackCatalogXmin); else - MyProc->xmin = feedbackXmin; + pg_atomic_write_u64(&MyProc->xmin, feedbackXmin); } } diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index ab97e71dd7..118ff24d1c 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -2481,6 +2481,7 @@ statext_expressions_load(Oid stxoid, bool inh, int idx) ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; tmptup.t_data = td; + HeapTupleCopyHeaderXids(&tmptup); tup = heap_copytuple(&tmptup); diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index fd7c40dcb0..ffcc0fc290 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -17,6 +17,7 @@ OBJS = \ buf_table.o \ bufmgr.o \ freelist.o \ - localbuf.o + localbuf.o \ + heap_convert.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6b95381481..ba5611b008 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -459,7 +459,8 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) ) -static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, +static Buffer ReadBuffer_common(Relation reln, + SMgrRelation 
smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit); @@ -777,7 +778,8 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, * miss. */ pgstat_count_buffer_read(reln); - buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence, + buf = ReadBuffer_common(reln, + RelationGetSmgr(reln), reln->rd_rel->relpersistence, forkNum, blockNum, mode, strategy, &hit); if (hit) pgstat_count_buffer_hit(reln); @@ -804,7 +806,7 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, SMgrRelation smgr = smgropen(rlocator, InvalidBackendId); - return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT : + return ReadBuffer_common(NULL, smgr, permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, mode, strategy, &hit); } @@ -816,7 +818,8 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, * *hit is set to true if the request was satisfied from shared buffer cache. */ static Buffer -ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, +ReadBuffer_common(Relation reln, + SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit) { @@ -1048,6 +1051,30 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, blockNum, relpath(smgr->smgr_rlocator, forkNum)))); } + + if (PageGetPageLayoutVersion(bufBlock) != PG_PAGE_LAYOUT_VERSION && + !PageIsNew((Page) bufBlock)) + { + Buffer buf = BufferDescriptorGetBuffer(bufHdr); + + /* + * All the forks but MAIN_FORKNUM should be converted to the + * actual page layout version in pg_upgrade. 
+ */ + if (forkNum != MAIN_FORKNUM) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid fork type (%d) in block %u of relation %s", + forkNum, blockNum, + relpath(smgr->smgr_rlocator, forkNum)))); + + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + /* Check for no concurrent changes */ + if (PageGetPageLayoutVersion(bufBlock) != PG_PAGE_LAYOUT_VERSION) + convert_page(reln, bufBlock, buf, blockNum); + + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + } } } @@ -4131,6 +4158,64 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) } } +/* + * Mark buffer as converted - ie its format is changed without logical changes. + * + * It will override `full_page_write` GUC setting in XLogRecordAssemble. + */ +void +MarkBufferConverted(Buffer buffer, bool converted) +{ + BufferDesc *bufHdr; + uint32 buf_state; + bool has_mark; + + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + Assert(!BufferIsLocal(buffer)); + + bufHdr = GetBufferDescriptor(buffer - 1); + + Assert(GetPrivateRefCount(buffer) > 0); + if (converted) + { + /* here, either share or exclusive lock is OK */ + Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); + } + + buf_state = pg_atomic_read_u32(&bufHdr->state); + has_mark = (buf_state & BM_CONVERTED) != 0; + if (converted == has_mark) + return; + + buf_state = LockBufHdr(bufHdr); + buf_state &= ~BM_CONVERTED; + if (converted) + buf_state |= BM_CONVERTED; + UnlockBufHdr(bufHdr, buf_state); +} + +bool +IsBufferConverted(Buffer buffer) +{ + + BufferDesc *bufHdr; + uint32 buf_state; + + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + Assert(!BufferIsLocal(buffer)); + + bufHdr = GetBufferDescriptor(buffer - 1); + + Assert(GetPrivateRefCount(buffer) > 0); + + buf_state = pg_atomic_read_u32(&bufHdr->state); + return (buf_state & BM_CONVERTED) != 0; +} + /* * Release buffer content locks for shared buffers. 
* @@ -4165,6 +4250,47 @@ UnlockBuffers(void) } } +/* + * Is the shared buffer locked? + */ +bool +IsBufferLocked(Buffer buffer) +{ + BufferDesc *buf; + + if (buffer == InvalidBuffer) + return true; + + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + return true; /* local buffers need no lock */ + + buf = GetBufferDescriptor(buffer - 1); + + return LWLockHeldByMe(BufferDescriptorGetContentLock(buf)); +} + +/* + * Is the shared buffer locked in exclusive mode? + */ +bool +IsBufferLockedExclusive(Buffer buffer) +{ + BufferDesc *buf; + + if (buffer == InvalidBuffer) + return true; + + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + return true; /* local buffers need no lock */ + + buf = GetBufferDescriptor(buffer - 1); + + return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), + LW_EXCLUSIVE); +} + /* * Acquire or release the content_lock for the buffer. */ diff --git a/src/backend/storage/buffer/heap_convert.c b/src/backend/storage/buffer/heap_convert.c new file mode 100644 index 0000000000..e6abac9760 --- /dev/null +++ b/src/backend/storage/buffer/heap_convert.c @@ -0,0 +1,546 @@ +/*------------------------------------------------------------------------- + * + * heap_convert.c + * Heap page converter from 32bit to 64bit xid format + * + * Copyright (c) 2015-2022, Postgres Professional + * + * IDENTIFICATION + * src/backend/storage/buffer/heap_convert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/generic_xlog.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "catalog/catalog.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" + +static void repack_heap_tuples(Relation rel, Page page, Buffer buf, + BlockNumber blkno, bool double_xmax); + +/* + * itemoffcompare + * Sorting support for repack_heap_tuples() + */ +int +itemoffcompare(const void *item1, const void *item2) +{ + /* Sort in decreasing itemoff order */ + return 
((ItemIdCompactData *) item2)->itemoff - + ((ItemIdCompactData *) item1)->itemoff; +} + +/* + * Lazy page conversion from 32-bit to 64-bit XID at first read. + */ +void +convert_page(Relation rel, Page page, Buffer buf, BlockNumber blkno) +{ + static unsigned logcnt = 0; + bool logit; + PageHeader hdr = (PageHeader) page; + GenericXLogState *state = NULL; + uint16 checksum; + bool try_double_xmax; + + /* Not during XLog replaying */ + Assert(rel != NULL); + + /* Verify checksum */ + if (hdr->pd_checksum) + { + checksum = pg_checksum_page((char *) page, blkno); + if (checksum != hdr->pd_checksum) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("page verification failed, calculated checksum %u but expected %u", + checksum, hdr->pd_checksum))); + } + + /* + * We occasionally force logging of page conversion, so never-changed + * pages are converted in the end. FORCE_LOG_EVERY is chosen arbitrarily + * to log neither too much nor too little. + */ +#define FORCE_LOG_EVERY 128 + logit = !RecoveryInProgress() && XLogIsNeeded() && RelationNeedsWAL(rel); + logit = logit && (++logcnt % FORCE_LOG_EVERY) == 0; + if (logit) + { + state = GenericXLogStart(rel); + page = GenericXLogRegisterBuffer(state, buf, + GENERIC_XLOG_FULL_IMAGE); + hdr = (PageHeader) page; + } +#ifdef USE_ASSERT_CHECKING + else + { + /* Not already converted */ + Assert(PageGetPageLayoutVersion(page) != PG_PAGE_LAYOUT_VERSION); + /* Page in 32-bit xid format should not have PageSpecial. 
*/ + Assert(PageGetSpecialSize(page) == 0); + } +#endif + + switch (rel->rd_rel->relkind) + { + case 't': + try_double_xmax = hdr->pd_upper - hdr->pd_lower < + MAXALIGN(sizeof(ToastPageSpecialData)); + repack_heap_tuples(rel, page, buf, blkno, try_double_xmax); + break; + case 'r': + case 'p': + case 'm': + try_double_xmax = hdr->pd_upper - hdr->pd_lower < + MAXALIGN(sizeof(HeapPageSpecialData)); + repack_heap_tuples(rel, page, buf, blkno, try_double_xmax); + break; + case 'i': + /* no need to convert index */ + case 'S': + /* no real need to convert sequences */ + break; + default: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("conversion for relation \"%s\" cannot be done", + RelationGetRelationName(rel)), + errdetail_relkind_not_supported(rel->rd_rel->relkind))); + } + + hdr->pd_checksum = pg_checksum_page((char *) page, blkno); + + PageSetPageSizeAndVersion(page, PageGetPageSize(page), + PG_PAGE_LAYOUT_VERSION); + + if (logit) + { + /* + * Finish logging buffer conversion and mark buffer as dirty. + */ + Assert(state != NULL); + MarkBufferDirty(buf); + GenericXLogFinish(state); + } + else + { + /* + * Otherwise, it will be logged with full-page-write record on first + * actual change. + */ + MarkBufferConverted(buf, true); + } +} + +/* + * Convert xmin and xmax in a tuple. + * This also considers special cases: "double xmax" page format and multixact + * in xmax. 
+ */ +static void +convert_heap_tuple_xids(HeapTupleHeader tuple, TransactionId xid_base, + MultiXactId multi_base, bool double_xmax) +{ + /* Convert xmin */ + if (double_xmax) + { + /* Prepare tuple for "double xmax" page format */ + tuple->t_infomask |= HEAP_XMIN_FROZEN; + tuple->t_choice.t_heap.t_xmin = 0; + } + else + { + TransactionId xmin = tuple->t_choice.t_heap.t_xmin; + + if (TransactionIdIsNormal(xmin)) + { + if (HeapTupleHeaderXminFrozen(tuple)) + tuple->t_choice.t_heap.t_xmin = FrozenTransactionId; + else if (HeapTupleHeaderXminInvalid(tuple)) + tuple->t_choice.t_heap.t_xmin = InvalidTransactionId; + else + { + Assert(xmin >= xid_base + FirstNormalTransactionId); + /* Subtract xid_base from normal xmin */ + tuple->t_choice.t_heap.t_xmin = xmin - xid_base; + } + } + } + + /* If tuple has multixact flag, handle mxid wraparound */ + if ((tuple->t_infomask & HEAP_XMAX_IS_MULTI) && + !(tuple->t_infomask & HEAP_XMAX_INVALID)) + { + MultiXactId mxid = tuple->t_choice.t_heap.t_xmax; + + /* Handle mxid wraparound */ + if (mxid < multi_base) + { + mxid += ((MultiXactId) 1 << 32) - FirstMultiXactId; + Assert(mxid >= multi_base); + } + + if (double_xmax) + { + /* Save converted mxid into "double xmax" format */ + HeapTupleHeaderSetDoubleXmax(tuple, mxid); + } + else + { + /* + * Save converted mxid offset relative to (minmxid - 1), which + * will be page's mxid base. 
+ */ + Assert(mxid - multi_base + FirstMultiXactId <= PG_UINT32_MAX); + tuple->t_choice.t_heap.t_xmax = + (uint32) (mxid - multi_base + FirstMultiXactId); + } + } + /* Convert xmax */ + else if (!(tuple->t_infomask & HEAP_XMAX_INVALID)) + { + TransactionId xmax = tuple->t_choice.t_heap.t_xmax; + + if (double_xmax) + { + /* Save converted xmax into "double xmax" format */ + HeapTupleHeaderSetDoubleXmax(tuple, xmax); + } + else if (TransactionIdIsNormal(xmax)) + { + /* Subtract xid_base from normal xmax */ + Assert(xmax >= xid_base + FirstNormalTransactionId); + tuple->t_choice.t_heap.t_xmax = xmax - xid_base; + } + } + else + { + if (double_xmax) + HeapTupleHeaderSetDoubleXmax(tuple, InvalidTransactionId); + else + tuple->t_choice.t_heap.t_xmax = InvalidTransactionId; + } +} + +/* + * Correct page xmin/xmax based on tuple xmin/xmax values. + */ +static void +compute_xid_min_max(HeapTuple tuple, MultiXactId multi_base, + TransactionId *xid_min, TransactionId *xid_max, + MultiXactId *multi_min, MultiXactId *multi_max) +{ + /* xmin */ + if (!HeapTupleHeaderXminInvalid(tuple->t_data) && + !HeapTupleHeaderXminFrozen(tuple->t_data)) + { + TransactionId xid = HeapTupleGetRawXmin(tuple); + + if (TransactionIdIsNormal(xid)) + { + *xid_max = Max(*xid_max, xid); + *xid_min = Min(*xid_min, xid); + } + } + + /* xmax */ + if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)) + { + TransactionId xid; + + if (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) + { + MultiXactId mxid = HeapTupleGetRawXmax(tuple); + + Assert(MultiXactIdIsValid(mxid)); + + /* Handle mxid wraparound */ + if (mxid < multi_base) + { + mxid += ((MultiXactId) 1 << 32) - FirstMultiXactId; + Assert(mxid >= multi_base); + } + + *multi_max = Max(*multi_max, mxid); + *multi_min = Min(*multi_min, mxid); + + /* + * Also take into account hidden update xid, which can be + * extracted by the vacuum. 
+ */ + if (tuple->t_data->t_infomask & HEAP_XMAX_LOCK_ONLY) + xid = InvalidTransactionId; + else + xid = HeapTupleGetUpdateXid(tuple); + } + else + { + xid = HeapTupleGetRawXmax(tuple); + } + + if (TransactionIdIsNormal(xid)) + { + *xid_max = Max(*xid_max, xid); + *xid_min = Min(*xid_min, xid); + } + } +} + +/* + * Returns true if both: + * - xid_max: an upper boundary of xmin's and xmax'es of all tuples on a page + * - xid_min: a lower boundary of xmin's and xmax'es of all tuples on a page + * can be expressed by a 32-bit number relative to page's xid_base/multi_base + * or invalid. + * + * True value effectively means that these tuples can be directly put on one + * page in 64-xid format. + */ +static inline bool +xids_fit_page(TransactionId xid_min, TransactionId xid_max, + MultiXactId multi_min, MultiXactId multi_max) +{ + bool xid_max_fits = false; + bool multi_max_fits = false; + + if (xid_max == InvalidTransactionId) + xid_max_fits = true; + + if (xid_max - xid_min <= MaxShortTransactionId - FirstNormalTransactionId) + xid_max_fits = true; + + if (multi_max == InvalidMultiXactId) + multi_max_fits = true; + + if (multi_max - multi_min <= MaxShortTransactionId - FirstMultiXactId) + multi_max_fits = true; + + return xid_max_fits && multi_max_fits; +} + +/* + * Set "base" for page in 64-bit XID format. + * + * This should not be called for double xmax pages. They do not have room for + * a page special area. 
+ */ +static inline void +heap_page_set_base(Page page, + TransactionId xid_min, TransactionId xid_max, + MultiXactId multi_min, MultiXactId multi_max, + TransactionId *xid_base, MultiXactId *multi_base, + bool is_toast) +{ + PageHeader hdr = (PageHeader) page; + + if (xid_max != InvalidTransactionId) + *xid_base = xid_min - FirstNormalTransactionId; + else + *xid_base = InvalidTransactionId; + + if (multi_max != InvalidMultiXactId) + *multi_base = multi_min - FirstMultiXactId; + else + *multi_base = InvalidMultiXactId; + + if (is_toast) + { + ToastPageSpecial special; + + hdr->pd_special = BLCKSZ - MAXALIGN(sizeof(ToastPageSpecialData)); + special = ToastPageGetSpecial(page); + special->pd_xid_base = *xid_base; + } + else + { + HeapPageSpecial special; + + hdr->pd_special = BLCKSZ - MAXALIGN(sizeof(HeapPageSpecialData)); + special = HeapPageGetSpecial(page); + special->pd_xid_base = *xid_base; + special->pd_multi_base = *multi_base; + } +} + +/* + * repack_heap_tuples + * Convert heap page format reusing space of dead tuples + */ +static void +repack_heap_tuples(Relation rel, Page page, Buffer buf, BlockNumber blkno, + bool try_double_xmax) +{ + ItemIdCompactData items[MaxHeapTuplesPerPage]; + ItemIdCompact itemPtr = items; + int nitems = 0, + maxoff = PageGetMaxOffsetNumber(page), + idx, + occupied_space = 0; + Offset upper; + bool double_xmax, + special_fits, + toast; + PageHeader hdr = (PageHeader) page, + new_hdr; + char new_page[BLCKSZ] = {0}; + MultiXactId multi_base = rel->rd_rel->relminmxid, + multi_min = MaxMultiXactId, + multi_max = InvalidMultiXactId; + TransactionId xid_base = rel->rd_rel->relfrozenxid, + xid_min = MaxTransactionId, + xid_max = InvalidTransactionId; + + toast = IsToastRelation(rel); + + if (TransactionIdIsNormal(hdr->pd_prune_xid)) + xid_min = xid_max = hdr->pd_prune_xid; + + for (idx = 0; idx < maxoff; idx++) + { + HeapTupleData tuple; + ItemId lp; + + lp = PageGetItemId(page, idx + 1); + + /* Skip redirects and items without storage 
*/ + if (!ItemIdHasStorage(lp)) + continue; + + /* Build in-memory tuple representation */ + tuple.t_tableOid = 1; /* doesn't matter in this case */ + tuple.t_data = (HeapTupleHeader) PageGetItem(page, lp); + HeapTupleCopyHeaderXids(&tuple); + tuple.t_len = ItemIdGetLength(lp); + ItemPointerSet(&(tuple.t_self), blkno, ItemIdGetOffset(lp)); + + /* + * This is only needed to determine whether tuple is HEAPTUPLE_DEAD or + * HEAPTUPLE_RECENTLY_DEAD. And since this is the first time we read + * page after pg_upgrade, it cannot be HEAPTUPLE_RECENTLY_DEAD. See + * HeapTupleSatisfiesVacuum() for details + */ + if (try_double_xmax && + HeapTupleSatisfiesVacuum(&tuple, + (TransactionId) 1 << 32, buf) == HEAPTUPLE_DEAD) + { + ItemIdSetDead(lp); + } + + if (ItemIdIsNormal(lp) && ItemIdHasStorage(lp)) + { + itemPtr->offsetindex = idx; + itemPtr->itemoff = ItemIdGetOffset(lp); + if (unlikely(itemPtr->itemoff < hdr->pd_upper || + itemPtr->itemoff >= hdr->pd_special)) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item pointer: %u", + itemPtr->itemoff))); + } + + itemPtr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); + occupied_space += itemPtr->alignedlen; + nitems++; + itemPtr++; + if (try_double_xmax) + { + HeapTupleSetXmin(&tuple, FrozenTransactionId); + HeapTupleHeaderSetXminFrozen(tuple.t_data); + } + + compute_xid_min_max(&tuple, multi_base, + &xid_min, &xid_max, + &multi_min, &multi_max); + } + } + + /* Write new header */ + new_hdr = (PageHeader) new_page; + *new_hdr = *hdr; + new_hdr->pd_lower = SizeOfPageHeaderData + maxoff * sizeof(ItemIdData); + + if (toast) + special_fits = BLCKSZ - new_hdr->pd_lower - occupied_space >= + sizeof(ToastPageSpecialData); + else + special_fits = BLCKSZ - new_hdr->pd_lower - occupied_space >= + sizeof(HeapPageSpecialData); + + double_xmax = !special_fits || + !xids_fit_page(xid_min, xid_max, multi_min, multi_max); + + if (!double_xmax) + { + Assert(xid_max == InvalidTransactionId || xid_max >= xid_min); + 
Assert(multi_max == InvalidMultiXactId || multi_max >= multi_min); + + heap_page_set_base(new_page, + xid_min, xid_max, + multi_min, multi_max, + &xid_base, &multi_base, + toast); + + HeapPageSetPruneXid(new_page, new_hdr->pd_prune_xid, toast); + } + else + { + /* No space for special area, switch to "double xmax" format */ + elog(DEBUG2, "convert heap page %u of relation \"%s\" to double xmax format", + blkno, RelationGetRelationName(rel)); + + if (try_double_xmax) + { + xid_base = InvalidTransactionId; + multi_base = InvalidMultiXactId; + } + else + { + repack_heap_tuples(rel, page, buf, blkno, true); + return; + } + } + + /* Copy ItemIds with an offset */ + memcpy((char *) new_page + SizeOfPageHeaderData, + (char *) page + SizeOfPageHeaderData, + hdr->pd_lower - SizeOfPageHeaderData); + + /* Move live tuples */ + upper = new_hdr->pd_special; + for (idx = 0; idx < nitems; idx++) + { + HeapTupleHeader tuple; + ItemId lp; + + itemPtr = &items[idx]; + lp = PageGetItemId(new_page, itemPtr->offsetindex + 1); + upper -= itemPtr->alignedlen; + occupied_space -= itemPtr->alignedlen; + + memcpy((char *) new_page + upper, + (char *) page + itemPtr->itemoff, + itemPtr->alignedlen); + + tuple = (HeapTupleHeader) (((char *) new_page) + upper); + + convert_heap_tuple_xids(tuple, xid_base, multi_base, double_xmax); + + lp->lp_off = upper; + } + + Assert(occupied_space == 0); + + new_hdr->pd_upper = upper; + if (new_hdr->pd_lower > new_hdr->pd_upper) + elog(ERROR, "cannot convert block %u of relation \"%s\"", + blkno, RelationGetRelationName(rel)); + + memcpy(page, new_page, BLCKSZ); +} diff --git a/src/backend/storage/buffer/meson.build b/src/backend/storage/buffer/meson.build index 56a59b5248..a099145872 100644 --- a/src/backend/storage/buffer/meson.build +++ b/src/backend/storage/buffer/meson.build @@ -3,5 +3,6 @@ backend_sources += files( 'buf_table.c', 'bufmgr.c', 'freelist.c', + 'heap_convert.c', 'localbuf.c', ) diff --git a/src/backend/storage/ipc/procarray.c 
b/src/backend/storage/ipc/procarray.c index 67c159a492..47589ec53b 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -67,7 +67,7 @@ #include "utils/rel.h" #include "utils/snapmgr.h" -#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) +#define UINT64_ACCESS_ONCE(var) ((uint64)(*((volatile uint64 *)&(var)))) /* Our shared memory area */ typedef struct ProcArrayStruct @@ -356,9 +356,6 @@ static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId l static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); static void MaintainLatestCompletedXid(TransactionId latestXid); static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); - -static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, - TransactionId xid); static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); /* @@ -517,7 +514,8 @@ ProcArrayAdd(PGPROC *proc) arrayP->pgprocnos[index] = proc->pgprocno; proc->pgxactoff = index; - ProcGlobal->xids[index] = proc->xid; + pg_atomic_write_u64(&ProcGlobal->xids[index], + pg_atomic_read_u64(&proc->xid)); ProcGlobal->subxidStates[index] = proc->subxidStatus; ProcGlobal->statusFlags[index] = proc->statusFlags; @@ -577,7 +575,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) if (TransactionIdIsValid(latestXid)) { - Assert(TransactionIdIsValid(ProcGlobal->xids[myoff])); + Assert(TransactionIdIsValid(pg_atomic_read_u64(&ProcGlobal->xids[myoff]))); /* Advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); @@ -585,17 +583,17 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) /* Same with xactCompletionCount */ ShmemVariableCache->xactCompletionCount++; - ProcGlobal->xids[myoff] = InvalidTransactionId; + pg_atomic_write_u64(&ProcGlobal->xids[myoff], InvalidTransactionId); ProcGlobal->subxidStates[myoff].overflowed = false; ProcGlobal->subxidStates[myoff].count = 0; } else { /* 
Shouldn't be trying to remove a live transaction here */ - Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + Assert(!TransactionIdIsValid(pg_atomic_read_u64(&(ProcGlobal->xids[myoff])))); } - Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + Assert(!TransactionIdIsValid(pg_atomic_read_u64(&(ProcGlobal->xids[myoff])))); Assert(ProcGlobal->subxidStates[myoff].count == 0); Assert(ProcGlobal->subxidStates[myoff].overflowed == false); @@ -641,7 +639,6 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) LWLockRelease(ProcArrayLock); } - /* * ProcArrayEndTransaction -- mark a transaction as no longer running * @@ -666,7 +663,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * else is taking a snapshot. See discussion in * src/backend/access/transam/README. */ - Assert(TransactionIdIsValid(proc->xid)); + Assert(TransactionIdIsValid(pg_atomic_read_u64(&proc->xid))); /* * If we can immediately acquire ProcArrayLock, we clear our own XID @@ -688,12 +685,12 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * anyone else's calculation of a snapshot. We might change their * estimate of global xmin, but that's OK. */ - Assert(!TransactionIdIsValid(proc->xid)); + Assert(!TransactionIdIsValid(pg_atomic_read_u64(&proc->xid))); Assert(proc->subxidStatus.count == 0); Assert(!proc->subxidStatus.overflowed); proc->lxid = InvalidLocalTransactionId; - proc->xmin = InvalidTransactionId; + pg_atomic_write_u64(&proc->xmin, InvalidTransactionId); /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; @@ -729,13 +726,14 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) * processes' PGPROC entries. 
*/ Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE)); - Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); - Assert(ProcGlobal->xids[pgxactoff] == proc->xid); + Assert(TransactionIdIsValid(pg_atomic_read_u64(&ProcGlobal->xids[pgxactoff]))); + Assert(pg_atomic_read_u64(&ProcGlobal->xids[pgxactoff]) == + pg_atomic_read_u64(&proc->xid)); - ProcGlobal->xids[pgxactoff] = InvalidTransactionId; - proc->xid = InvalidTransactionId; + pg_atomic_write_u64(&ProcGlobal->xids[pgxactoff], InvalidTransactionId); + pg_atomic_write_u64(&proc->xid, InvalidTransactionId); proc->lxid = InvalidLocalTransactionId; - proc->xmin = InvalidTransactionId; + pg_atomic_write_u64(&proc->xmin, InvalidTransactionId); /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; @@ -788,7 +786,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) uint32 wakeidx; /* We should definitely have an XID to clear. */ - Assert(TransactionIdIsValid(proc->xid)); + Assert(TransactionIdIsValid(pg_atomic_read_u64(&proc->xid))); /* Add ourselves to the list of processes needing a group XID clear. 
*/ proc->procArrayGroupMember = true; @@ -917,11 +915,11 @@ ProcArrayClearTransaction(PGPROC *proc) pgxactoff = proc->pgxactoff; - ProcGlobal->xids[pgxactoff] = InvalidTransactionId; - proc->xid = InvalidTransactionId; + pg_atomic_write_u64(&ProcGlobal->xids[pgxactoff], InvalidTransactionId); + pg_atomic_write_u64(&proc->xid, InvalidTransactionId); proc->lxid = InvalidLocalTransactionId; - proc->xmin = InvalidTransactionId; + pg_atomic_write_u64(&proc->xmin, InvalidTransactionId); proc->recoveryConflictPending = false; Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); @@ -965,8 +963,7 @@ MaintainLatestCompletedXid(TransactionId latestXid) if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) { - ShmemVariableCache->latestCompletedXid = - FullXidRelativeTo(cur_latest, latestXid); + ShmemVariableCache->latestCompletedXid = FullTransactionIdFromXid(latestXid); } Assert(IsBootstrapProcessingMode() || @@ -980,7 +977,6 @@ static void MaintainLatestCompletedXidRecovery(TransactionId latestXid) { FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; - FullTransactionId rel; Assert(AmStartupProcess() || !IsUnderPostmaster); Assert(LWLockHeldByMe(ProcArrayLock)); @@ -990,14 +986,12 @@ MaintainLatestCompletedXidRecovery(TransactionId latestXid) * latestCompletedXid to be initialized in recovery. But in recovery it's * safe to access nextXid without a lock for the startup process. 
*/ - rel = ShmemVariableCache->nextXid; Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); if (!FullTransactionIdIsValid(cur_latest) || TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) { - ShmemVariableCache->latestCompletedXid = - FullXidRelativeTo(rel, latestXid); + ShmemVariableCache->latestCompletedXid = FullTransactionIdFromXid(latestXid); } Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); @@ -1375,7 +1369,7 @@ bool TransactionIdIsInProgress(TransactionId xid) { static TransactionId *xids = NULL; - static TransactionId *other_xids; + static pg_atomic_uint64 *other_xids; XidCacheStatus *other_subxidstates; int nxids = 0; ProcArrayStruct *arrayP = procArray; @@ -1471,7 +1465,7 @@ TransactionIdIsInProgress(TransactionId xid) continue; /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + pxid = pg_atomic_read_u64(&(other_xids[pgxactoff])); if (!TransactionIdIsValid(pxid)) continue; @@ -1503,7 +1497,7 @@ TransactionIdIsInProgress(TransactionId xid) for (j = pxids - 1; j >= 0; j--) { /* Fetch xid just once - see GetNewTransactionId */ - TransactionId cxid = UINT32_ACCESS_ONCE(proc->subxids.xids[j]); + TransactionId cxid = UINT64_ACCESS_ONCE(proc->subxids.xids[j]); if (TransactionIdEquals(cxid, xid)) { @@ -1588,7 +1582,7 @@ TransactionIdIsInProgress(TransactionId xid) topxid = SubTransGetTopmostTransaction(xid); Assert(TransactionIdIsValid(topxid)); if (!TransactionIdEquals(topxid, xid) && - pg_lfind32(topxid, xids, nxids)) + pg_lfind64(topxid, xids, nxids)) return true; cachedXidIsNotInProgress = xid; @@ -1608,7 +1602,7 @@ TransactionIdIsActive(TransactionId xid) { bool result = false; ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; + pg_atomic_uint64 *other_xids = ProcGlobal->xids; int i; /* @@ -1627,7 +1621,7 @@ TransactionIdIsActive(TransactionId xid) TransactionId pxid; /* Fetch xid just once - see 
GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(other_xids[i]); + pxid = pg_atomic_read_u64(&(other_xids[i])); if (!TransactionIdIsValid(pxid)) continue; @@ -1713,7 +1707,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) ProcArrayStruct *arrayP = procArray; TransactionId kaxmin; bool in_recovery = RecoveryInProgress(); - TransactionId *other_xids = ProcGlobal->xids; + pg_atomic_uint64 *other_xids = ProcGlobal->xids; /* inferred after ProcArrayLock is released */ h->catalog_oldest_nonremovable = InvalidTransactionId; @@ -1729,7 +1723,8 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) * additions. */ { - TransactionId initial; + TransactionId initial, + xid; initial = XidFromFullTransactionId(h->latest_completed); Assert(TransactionIdIsValid(initial)); @@ -1751,8 +1746,9 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) * definition, can't be any newer changes in the temp table than * latestCompletedXid. */ - if (TransactionIdIsValid(MyProc->xid)) - h->temp_oldest_nonremovable = MyProc->xid; + xid = pg_atomic_read_u64(&MyProc->xid); + if (TransactionIdIsValid(xid)) + h->temp_oldest_nonremovable = xid; else h->temp_oldest_nonremovable = initial; } @@ -1774,8 +1770,8 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) TransactionId xmin; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(other_xids[index]); - xmin = UINT32_ACCESS_ONCE(proc->xmin); + xid = pg_atomic_read_u64(&(other_xids[index])); + xmin = pg_atomic_read_u64(&proc->xmin); /* * Consider both the transaction's Xmin, and its Xid. @@ -2150,8 +2146,8 @@ GetSnapshotDataReuse(Snapshot snapshot) * requirement that concurrent GetSnapshotData() calls yield the same * xmin. 
*/ - if (!TransactionIdIsValid(MyProc->xmin)) - MyProc->xmin = TransactionXmin = snapshot->xmin; + if (!TransactionIdIsValid(pg_atomic_read_u64(&MyProc->xmin))) + pg_atomic_write_u64(&MyProc->xmin, TransactionXmin = snapshot->xmin); RecentXmin = snapshot->xmin; Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); @@ -2203,7 +2199,7 @@ Snapshot GetSnapshotData(Snapshot snapshot) { ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; + pg_atomic_uint64 *other_xids = ProcGlobal->xids; TransactionId xmin; TransactionId xmax; int count = 0; @@ -2266,8 +2262,8 @@ GetSnapshotData(Snapshot snapshot) latest_completed = ShmemVariableCache->latestCompletedXid; mypgxactoff = MyProc->pgxactoff; - myxid = other_xids[mypgxactoff]; - Assert(myxid == MyProc->xid); + myxid = pg_atomic_read_u64(&other_xids[mypgxactoff]); + Assert(myxid == pg_atomic_read_u64(&MyProc->xid)); oldestxid = ShmemVariableCache->oldestXid; curXactCompletionCount = ShmemVariableCache->xactCompletionCount; @@ -2301,7 +2297,7 @@ GetSnapshotData(Snapshot snapshot) for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) { /* Fetch xid just once - see GetNewTransactionId */ - TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + TransactionId xid = pg_atomic_read_u64(&(other_xids[pgxactoff])); uint8 statusFlags; Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); @@ -2438,8 +2434,8 @@ GetSnapshotData(Snapshot snapshot) replication_slot_xmin = procArray->replication_slot_xmin; replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; - if (!TransactionIdIsValid(MyProc->xmin)) - MyProc->xmin = TransactionXmin = xmin; + if (!TransactionIdIsValid(pg_atomic_read_u64(&MyProc->xmin))) + pg_atomic_write_u64(&MyProc->xmin, TransactionXmin = xmin); LWLockRelease(ProcArrayLock); @@ -2451,12 +2447,7 @@ GetSnapshotData(Snapshot snapshot) FullTransactionId def_vis_fxid_data; FullTransactionId oldestfxid; - /* - * Converting oldestXid 
is only safe when xid horizon cannot advance, - * i.e. holding locks. While we don't hold the lock anymore, all the - * necessary data has been gathered with lock held. - */ - oldestfxid = FullXidRelativeTo(latest_completed, oldestxid); + oldestfxid = FullTransactionIdFromXid(oldestxid); /* apply vacuum_defer_cleanup_age */ def_vis_xid_data = @@ -2479,8 +2470,8 @@ GetSnapshotData(Snapshot snapshot) def_vis_xid = TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid); - def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid); - def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data); + def_vis_fxid = FullTransactionIdFromXid(def_vis_xid); + def_vis_fxid_data = FullTransactionIdFromXid(def_vis_xid_data); /* * Check if we can increase upper bound. As a previous @@ -2499,7 +2490,7 @@ GetSnapshotData(Snapshot snapshot) /* See temp_oldest_nonremovable computation in ComputeXidHorizons() */ if (TransactionIdIsNormal(myxid)) GlobalVisTempRels.definitely_needed = - FullXidRelativeTo(latest_completed, myxid); + FullTransactionIdFromXid(myxid); else { GlobalVisTempRels.definitely_needed = latest_completed; @@ -2606,7 +2597,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, /* * Likewise, let's just make real sure its xmin does cover us. */ - xid = UINT32_ACCESS_ONCE(proc->xmin); + xid = pg_atomic_read_u64(&proc->xmin); if (!TransactionIdIsNormal(xid) || !TransactionIdPrecedesOrEquals(xid, xmin)) continue; @@ -2617,7 +2608,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, * GetSnapshotData first, we'll be overwriting a valid xmin here, so * we don't check that.) */ - MyProc->xmin = TransactionXmin = xmin; + pg_atomic_write_u64(&MyProc->xmin, TransactionXmin = xmin); result = true; break; @@ -2661,7 +2652,7 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * can't go backwards. Also, make sure it's running in the same database, * so that the per-database xmin cannot go backwards. 
*/ - xid = UINT32_ACCESS_ONCE(proc->xmin); + xid = pg_atomic_read_u64(&proc->xmin); if (proc->databaseId == MyDatabaseId && TransactionIdIsNormal(xid) && TransactionIdPrecedesOrEquals(xid, xmin)) @@ -2670,7 +2661,7 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * Install xmin and propagate the statusFlags that affect how the * value is interpreted by vacuum. */ - MyProc->xmin = TransactionXmin = xmin; + pg_atomic_write_u64(&MyProc->xmin, TransactionXmin = xmin); MyProc->statusFlags = (MyProc->statusFlags & ~PROC_XMIN_FLAGS) | (proc->statusFlags & PROC_XMIN_FLAGS); ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; @@ -2721,7 +2712,7 @@ GetRunningTransactionData(void) static RunningTransactionsData CurrentRunningXactsData; ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; + pg_atomic_uint64 *other_xids = ProcGlobal->xids; RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData; TransactionId latestCompletedXid; TransactionId oldestRunningXid; @@ -2780,7 +2771,7 @@ GetRunningTransactionData(void) TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(other_xids[index]); + xid = pg_atomic_read_u64(&(other_xids[index])); /* * We don't need to store transactions that don't have a TransactionId @@ -2893,7 +2884,7 @@ TransactionId GetOldestActiveTransactionId(void) { ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; + pg_atomic_uint64 *other_xids = ProcGlobal->xids; TransactionId oldestRunningXid; int index; @@ -2919,7 +2910,7 @@ GetOldestActiveTransactionId(void) TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(other_xids[index]); + xid = pg_atomic_read_u64(&(other_xids[index])); if (!TransactionIdIsNormal(xid)) continue; @@ -3007,7 +2998,7 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) */ if (!recovery_in_progress) { - TransactionId *other_xids = 
ProcGlobal->xids; + pg_atomic_uint64 *other_xids = ProcGlobal->xids; /* * Spin over procArray collecting min(ProcGlobal->xids[i]) @@ -3017,7 +3008,7 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(other_xids[index]); + xid = pg_atomic_read_u64(&(other_xids[index])); if (!TransactionIdIsNormal(xid)) continue; @@ -3212,7 +3203,7 @@ BackendXidGetPid(TransactionId xid) { int result = 0; ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; + pg_atomic_uint64 *other_xids = ProcGlobal->xids; int index; if (xid == InvalidTransactionId) /* never match invalid xid */ @@ -3225,7 +3216,7 @@ BackendXidGetPid(TransactionId xid) int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - if (other_xids[index] == xid) + if (pg_atomic_read_u64(&other_xids[index]) == xid) { result = proc->pid; break; @@ -3306,7 +3297,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, if (allDbs || proc->databaseId == MyDatabaseId) { /* Fetch xmin just once - might change on us */ - TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + TransactionId pxmin = pg_atomic_read_u64(&proc->xmin); if (excludeXmin0 && !TransactionIdIsValid(pxmin)) continue; @@ -3401,7 +3392,7 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) proc->databaseId == dbOid) { /* Fetch xmin just once - can't change on us, but good coding */ - TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + TransactionId pxmin = pg_atomic_read_u64(&proc->xmin); /* * We ignore an invalid pxmin because this means that backend has @@ -3528,7 +3519,7 @@ MinimumActiveBackends(int min) continue; /* do not count deleted entries */ if (proc == MyProc) continue; /* do not count myself */ - if (proc->xid == InvalidTransactionId) + if (pg_atomic_read_u64(&proc->xid) == InvalidTransactionId) continue; /* do not count if no XID assigned */ if (proc->pid == 0) 
continue; /* do not count prepared xacts */ @@ -4108,17 +4099,13 @@ static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons) { GlobalVisSharedRels.maybe_needed = - FullXidRelativeTo(horizons->latest_completed, - horizons->shared_oldest_nonremovable); + FullTransactionIdFromXid(horizons->shared_oldest_nonremovable); GlobalVisCatalogRels.maybe_needed = - FullXidRelativeTo(horizons->latest_completed, - horizons->catalog_oldest_nonremovable); + FullTransactionIdFromXid(horizons->catalog_oldest_nonremovable); GlobalVisDataRels.maybe_needed = - FullXidRelativeTo(horizons->latest_completed, - horizons->data_oldest_nonremovable); + FullTransactionIdFromXid(horizons->data_oldest_nonremovable); GlobalVisTempRels.maybe_needed = - FullXidRelativeTo(horizons->latest_completed, - horizons->temp_oldest_nonremovable); + FullTransactionIdFromXid(horizons->temp_oldest_nonremovable); /* * In longer running transactions it's possible that transactions we @@ -4207,15 +4194,7 @@ GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) { FullTransactionId fxid; - /* - * Convert 32 bit argument to FullTransactionId. We can do so safely - * because we know the xid has to, at the very least, be between - * [oldestXid, nextXid), i.e. within 2 billion of xid. To avoid taking a - * lock to determine either, we can just compare with - * state->definitely_needed, which was based on those value at the time - * the current snapshot was built. - */ - fxid = FullXidRelativeTo(state->definitely_needed, xid); + fxid = FullTransactionIdFromXid(xid); return GlobalVisTestIsRemovableFullXid(state, fxid); } @@ -4278,32 +4257,6 @@ GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) return GlobalVisTestIsRemovableXid(state, xid); } -/* - * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it - * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). - * - * Be very careful about when to use this function. 
It can only safely be used - * when there is a guarantee that xid is within MaxTransactionId / 2 xids of - * rel. That e.g. can be guaranteed if the caller assures a snapshot is - * held by the backend and xid is from a table (where vacuum/freezing ensures - * the xid has to be within that range), or if xid is from the procarray and - * prevents xid wraparound that way. - */ -static inline FullTransactionId -FullXidRelativeTo(FullTransactionId rel, TransactionId xid) -{ - TransactionId rel_xid = XidFromFullTransactionId(rel); - - Assert(TransactionIdIsValid(xid)); - Assert(TransactionIdIsValid(rel_xid)); - - /* not guaranteed to find issues, but likely to catch mistakes */ - AssertTransactionIdInAllowableRange(xid); - - return FullTransactionIdFromU64(U64FromFullTransactionId(rel) - + (int32) (xid - rel_xid)); -} - /* ---------------------------------------------- * KnownAssignedTransactionIds sub-module diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c index 59310b708f..bf712ba7ee 100644 --- a/src/backend/storage/ipc/sinvaladt.c +++ b/src/backend/storage/ipc/sinvaladt.c @@ -426,8 +426,8 @@ BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmi if (proc != NULL) { - *xid = proc->xid; - *xmin = proc->xmin; + *xid = pg_atomic_read_u64(&proc->xid); + *xmin = pg_atomic_read_u64(&proc->xmin); } } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 99341c3e87..5f1bed8b56 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -37,7 +37,7 @@ #include "utils/timestamp.h" /* User-settable GUC parameters */ -int vacuum_defer_cleanup_age; +int64 vacuum_defer_cleanup_age; int max_standby_archive_delay = 30 * 1000; int max_standby_streaming_delay = 30 * 1000; bool log_recovery_conflict_waits = false; @@ -486,8 +486,8 @@ ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXi FullTransactionId nextXid = 
ReadNextFullTransactionId(); uint64 diff; - diff = U64FromFullTransactionId(nextXid) - - U64FromFullTransactionId(latestRemovedFullXid); + diff = XidFromFullTransactionId(nextXid) - + XidFromFullTransactionId(latestRemovedFullXid); if (diff < MaxTransactionId / 2) { TransactionId latestRemovedXid; diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 1043068bac..0aa3ae79ac 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -1163,10 +1163,18 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field1); break; case LOCKTAG_TRANSACTION: - appendStringInfo(buf, - _("transaction %u"), - tag->locktag_field1); - break; + { + char xid_str[32]; + + /* make translatable string */ + snprintf(xid_str, sizeof(xid_str), "%llu", + (unsigned long long) + (TransactionId) tag->locktag_field1 | + ((TransactionId) tag->locktag_field2 << 32)); + + appendStringInfo(buf, _("transaction %s"), xid_str); + break; + } case LOCKTAG_VIRTUALTRANSACTION: appendStringInfo(buf, _("virtual transaction %d/%u"), diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 3d1049cf75..68ce82ddf8 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -4060,7 +4060,7 @@ GetRunningTransactionLocks(int *nlocks) { PGPROC *proc = proclock->tag.myProc; LOCK *lock = proclock->tag.myLock; - TransactionId xid = proc->xid; + TransactionId xid = pg_atomic_read_u64(&proc->xid); /* * Don't record locks for transactions if we know they have @@ -4689,7 +4689,7 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) * so we won't save an XID of a different VXID. It doesn't matter whether * we save this before or after setting up the primary lock table entry. 
*/ - xid = proc->xid; + xid = pg_atomic_read_u64(&proc->xid); /* Done with proc->fpLockBits */ LWLockRelease(&proc->fpInfoLock); diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index b71a60952d..529e01eb8f 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -334,9 +334,9 @@ static SlruCtlData SerialSlruCtlData; #define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \ (SerialSlruCtl->shared->page_buffer[slotno] + \ - ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE)))) + ((((uint64) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE)))) -#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE) +#define SerialPage(xid) ((int64) (((uint64) (xid)) / SERIAL_ENTRIESPERPAGE)) typedef struct SerialControlData { @@ -4078,7 +4078,7 @@ XidIsConcurrent(TransactionId xid) if (TransactionIdFollowsOrEquals(xid, snap->xmax)) return true; - return pg_lfind32(xid, snap->xip, snap->xcnt); + return pg_lfind64(xid, snap->xip, snap->xcnt); } bool diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 13fa07b0ff..7c81b1aa6a 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -205,7 +205,7 @@ InitProcGlobal(void) * how hotly they are accessed. */ ProcGlobal->xids = - (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); + (pg_atomic_uint64 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates)); MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates)); @@ -214,6 +214,7 @@ InitProcGlobal(void) for (i = 0; i < TotalProcs; i++) { + pg_atomic_init_u64(&ProcGlobal->xids[i], 0); /* Common initialization for all PGPROCs, regardless of type. 
*/ /* @@ -383,8 +384,8 @@ InitProcess(void) MyProc->lxid = InvalidLocalTransactionId; MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; - MyProc->xid = InvalidTransactionId; - MyProc->xmin = InvalidTransactionId; + pg_atomic_init_u64(&MyProc->xid, InvalidTransactionId); + pg_atomic_init_u64(&MyProc->xmin, InvalidTransactionId); MyProc->pid = MyProcPid; /* backendId, databaseId and roleId will be filled in later */ MyProc->backendId = InvalidBackendId; @@ -570,8 +571,8 @@ InitAuxiliaryProcess(void) MyProc->lxid = InvalidLocalTransactionId; MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; - MyProc->xid = InvalidTransactionId; - MyProc->xmin = InvalidTransactionId; + pg_atomic_init_u64(&MyProc->xid, InvalidTransactionId); + pg_atomic_init_u64(&MyProc->xmin, InvalidTransactionId); MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 8b617c7e79..198fdb87e6 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -21,11 +21,31 @@ #include "storage/checksum.h" #include "utils/memdebug.h" #include "utils/memutils.h" +#include "utils/snapmgr.h" /* GUC variable */ bool ignore_checksum_failure = false; +/* + * HeapPageSpecialData used when pd_special == BLCKSZ. This is special format + * used when page with 32-bit xids doesn't fit HeapPageSpecialData. Then + * all xmin's are frozen (can do this for all live tuples after pg_upgrade), + * while 64-bit xmax is stored in both t_heap.t_xmin and t_heap.t_xmax. + * This is so-called "double xmax" format. 
+ */ +static HeapPageSpecialData heapDoubleXmaxSpecialData = +{ + .pd_xid_base = MaxTransactionId, + .pd_multi_base = MaxTransactionId +}; +HeapPageSpecial heapDoubleXmaxSpecial = &heapDoubleXmaxSpecialData; + +static ToastPageSpecialData toastDoubleXmaxSpecialData = +{ + .pd_xid_base = MaxTransactionId +}; +ToastPageSpecial toastDoubleXmaxSpecial = &toastDoubleXmaxSpecialData; /* ---------------------------------------------------------------- * Page support functions @@ -432,15 +452,144 @@ PageRestoreTempPage(Page tempPage, Page oldPage) } /* - * Tuple defrag support for PageRepairFragmentation and PageIndexMultiDelete + * Get minimum and maximum values of xid and multixact on "double xmax" page. */ -typedef struct itemIdCompactData +static void +heap_page_double_xmax_get_min_max(Page page, + TransactionId *xid_min, + TransactionId *xid_max, + MultiXactId *multi_min, + MultiXactId *multi_max) { - uint16 offsetindex; /* linp array index */ - int16 itemoff; /* page offset of item data */ - uint16 alignedlen; /* MAXALIGN(item data len) */ -} itemIdCompactData; -typedef itemIdCompactData *itemIdCompact; + bool xid_found = false, + multi_found = false; + OffsetNumber offnum, + maxoff; + + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleHeader htup; + TransactionId xmax; + + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsNormal(itemid)) + continue; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + xmax = HeapTupleHeaderGetDoubleXmax(htup); + + if (!TransactionIdIsNormal(xmax)) + continue; + + if (!(htup->t_infomask & HEAP_XMAX_IS_MULTI)) + { + if (!xid_found) + { + *xid_min = *xid_max = xmax; + xid_found = true; + } + else + { + *xid_min = Min(*xid_min, xmax); + *xid_max = Max(*xid_max, xmax); + } + } + else + { + if (!multi_found) + { + *multi_min = *multi_max = xmax; + multi_found = true; + } + else + { + *multi_min = Min(*multi_min, 
xmax); + *multi_max = Max(*multi_max, xmax); + } + } + } +} + +/* + * Add special area to heap page, so convert from "double xmax" to normal + * format. + */ +static void +heap_page_add_special_area(ItemIdCompact itemidbase, int nitems, Page page, + TransactionId xid_base, MultiXactId multi_base, + bool is_toast) +{ + char newPage[BLCKSZ]; + PageHeader phdr = (PageHeader) page; + PageHeader new_phdr = (PageHeader) newPage; + Offset upper; + int i; + + memcpy(newPage, page, phdr->pd_lower); + + /* Add special area */ + if (is_toast) + { + ToastPageSpecial special; + + new_phdr->pd_special = PageGetPageSize(newPage) - sizeof(ToastPageSpecialData); + special = (ToastPageSpecial) ((Pointer) (newPage) + new_phdr->pd_special); + special->pd_xid_base = xid_base; + } + else + { + HeapPageSpecial special; + + new_phdr->pd_special = PageGetPageSize(newPage) - sizeof(HeapPageSpecialData); + special = (HeapPageSpecial) ((Pointer) (newPage) + new_phdr->pd_special); + special->pd_xid_base = xid_base; + special->pd_multi_base = multi_base; + } + + /* sort itemIdSortData array into decreasing itemoff order */ + qsort((char *) itemidbase, nitems, sizeof(ItemIdCompactData), + itemoffcompare); + + upper = new_phdr->pd_special; + for (i = 0; i < nitems; i++) + { + ItemIdCompact itemidptr = &itemidbase[i]; + ItemId lp; + HeapTupleHeader old_htup; + HeapTupleHeader new_htup; + TransactionId xmax; + + lp = PageGetItemId(page, itemidptr->offsetindex + 1); + old_htup = (HeapTupleHeader) PageGetItem(page, lp); + upper -= itemidptr->alignedlen; + memcpy((Pointer) newPage + upper, + (Pointer) page + itemidptr->itemoff, + itemidptr->alignedlen); + lp = PageGetItemId(newPage, itemidptr->offsetindex + 1); + lp->lp_off = upper; + new_htup = (HeapTupleHeader) PageGetItem(newPage, lp); + + /* Convert xmax value */ + new_htup->t_choice.t_heap.t_xmin = FrozenTransactionId; + xmax = HeapTupleHeaderGetDoubleXmax(old_htup); + if (!(new_htup->t_infomask & HEAP_XMAX_IS_MULTI)) + 
new_htup->t_choice.t_heap.t_xmax = NormalTransactionIdToShort(xid_base, xmax); + else + new_htup->t_choice.t_heap.t_xmax = NormalTransactionIdToShort(multi_base, xmax); + } + + new_phdr->pd_upper = upper; + + memcpy(page, newPage, PageGetPageSize(newPage)); + elog(DEBUG2, "convert heap page from double xmax to normal format"); +} /* * After removing or marking some line pointers unused, move the tuples to @@ -471,21 +620,47 @@ typedef itemIdCompactData *itemIdCompact; * Callers must ensure that nitems is > 0 */ static void -compactify_tuples(itemIdCompact itemidbase, int nitems, Page page, bool presorted) +compactify_tuples(ItemIdCompact itemidbase, int nitems, Page page, + bool presorted, bool addspecial, bool is_toast) { PageHeader phdr = (PageHeader) page; Offset upper; Offset copy_tail; Offset copy_head; - itemIdCompact itemidptr; + ItemIdCompact itemidptr; int i; /* Code within will not work correctly if nitems == 0 */ Assert(nitems > 0); - if (presorted) + /* Add special area to the heap page if possible */ + if (addspecial) { + TransactionId xid_min = FirstNormalTransactionId, + xid_max = FirstNormalTransactionId; + MultiXactId multi_min = FirstNormalTransactionId, + multi_max = FirstNormalTransactionId; + Assert(phdr->pd_special == PageGetPageSize(page)); + + heap_page_double_xmax_get_min_max(page, &xid_min, &xid_max, + &multi_min, &multi_max); + + if (xid_max - xid_min < (TransactionId) (MaxShortTransactionId - FirstNormalTransactionId) && + multi_max - multi_min < (TransactionId) (MaxShortTransactionId - FirstNormalTransactionId)) + { + Assert(xid_min >= FirstNormalTransactionId); + Assert(multi_min >= FirstNormalTransactionId); + heap_page_add_special_area(itemidbase, nitems, page, + xid_min - FirstNormalTransactionId, + multi_min - FirstNormalTransactionId, + is_toast); + return; + } + } + + if (presorted) + { #ifdef USE_ASSERT_CHECKING { /* @@ -696,14 +871,14 @@ compactify_tuples(itemIdCompact itemidbase, int nitems, Page page, bool presorte * the line 
pointer array following array truncation. */ void -PageRepairFragmentation(Page page) +PageRepairFragmentation(Page page, bool is_toast) { Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; Offset last_offset; - itemIdCompactData itemidbase[MaxHeapTuplesPerPage]; - itemIdCompact itemidptr; + ItemIdCompactData itemidbase[MaxHeapTuplesPerPage]; + ItemIdCompact itemidptr; ItemId lp; int nline, nstorage, @@ -777,11 +952,30 @@ PageRepairFragmentation(Page page) nstorage = itemidptr - itemidbase; if (nstorage == 0) { + if (pd_special == PageGetPageSize(page)) + { + if (is_toast) + { + pd_special = PageGetPageSize(page) - sizeof(ToastPageSpecialData); + ((PageHeader) page)->pd_special = pd_special; + ToastPageGetSpecial(page)->pd_xid_base = 0; + } + else + { + pd_special = PageGetPageSize(page) - sizeof(HeapPageSpecialData); + ((PageHeader) page)->pd_special = pd_special; + HeapPageGetSpecial(page)->pd_xid_base = 0; + HeapPageGetSpecial(page)->pd_multi_base = 0; + } + } + /* Page is completely empty, so just reset it quickly */ ((PageHeader) page)->pd_upper = pd_special; } else { + bool addspecial = false; + /* Need to compact the page the hard way */ if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, @@ -789,7 +983,25 @@ PageRepairFragmentation(Page page) errmsg("corrupted item lengths: total %u, available space %u", (unsigned int) totallen, pd_special - pd_lower))); - compactify_tuples(itemidbase, nstorage, page, presorted); + /* + * Try to add special area to the heap page if it has enough of free + * space. + */ + if (pd_special == PageGetPageSize(page)) + { + Size special_size, + actual_size; + + special_size = is_toast ? 
sizeof(ToastPageSpecialData) : + sizeof(HeapPageSpecialData); + actual_size = (Size) (pd_special - pd_lower) - totallen; + + if (actual_size >= special_size) + addspecial = true; + } + + compactify_tuples(itemidbase, nstorage, page, presorted, addspecial, + is_toast); } if (finalusedlp != nline) @@ -992,6 +1204,9 @@ PageGetHeapFreeSpace(Page page) { Size space; + if (HeapPageIsDoubleXmax(page)) + return 0; + space = PageGetFreeSpace(page); if (space > 0) { @@ -1165,9 +1380,9 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) Offset pd_upper = phdr->pd_upper; Offset pd_special = phdr->pd_special; Offset last_offset; - itemIdCompactData itemidbase[MaxIndexTuplesPerPage]; + ItemIdCompactData itemidbase[MaxIndexTuplesPerPage]; ItemIdData newitemids[MaxIndexTuplesPerPage]; - itemIdCompact itemidptr; + ItemIdCompact itemidptr; ItemId lp; int nline, nused; @@ -1275,7 +1490,12 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) /* and compactify the tuple data */ if (nused > 0) - compactify_tuples(itemidbase, nused, page, presorted); + { + bool is_toast; + + is_toast = BLCKSZ - pd_special == sizeof(ToastPageSpecialData); + compactify_tuples(itemidbase, nused, page, presorted, false, is_toast); + } else phdr->pd_upper = pd_special; } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 662b5b50db..9eebbe1d25 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3786,15 +3786,13 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx, case 'm': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_mxid = value; + start_mxid = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_mxid) /* overflow */ + !StartMultiXactIdIsValid(start_mxid)) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -3817,15 +3815,13 @@ process_postgres_switches(int argc, char *argv[], GucContext 
ctx, case 'o': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_mxoff = value; + start_mxoff = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_mxoff) /* overflow */ + !StartMultiXactOffsetIsValid(start_mxoff)) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -3890,15 +3886,13 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx, case 'x': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_xid = value; + start_xid = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_xid) /* overflow */ + !StartTransactionIdIsValid(start_xid)) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -4085,7 +4079,6 @@ PostgresSingleUserMain(int argc, char *argv[], PostgresMain(dbname, username); } - /* ---------------------------------------------------------------- * PostgresMain * postgres main loop -- all backends, interactive or otherwise loop here diff --git a/src/backend/utils/adt/enum.c b/src/backend/utils/adt/enum.c index 0cc7a6d8ad..04ca6ff16e 100644 --- a/src/backend/utils/adt/enum.c +++ b/src/backend/utils/adt/enum.c @@ -76,7 +76,7 @@ check_safe_enum_use(HeapTuple enumval_tup) * Usually, a row would get hinted as committed when it's read or loaded * into syscache; but just in case not, let's check the xmin directly. 
*/ - xmin = HeapTupleHeaderGetXmin(enumval_tup->t_data); + xmin = HeapTupleGetXmin(enumval_tup); if (!TransactionIdIsInProgress(xmin) && TransactionIdDidCommit(xmin)) return; diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index fd0d355789..78ffef1071 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -3661,6 +3661,7 @@ populate_recordset_record(PopulateRecordsetState *state, JsObject *obj) tuple.t_len = HeapTupleHeaderGetDatumLength(tuphead); ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tuple); tuple.t_data = tuphead; tuplestore_puttuple(state->tuple_store, &tuple); diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index 14151bc81c..6fbdfdfb82 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -77,7 +77,7 @@ VXIDGetDatum(BackendId bid, LocalTransactionId lxid) * The representation is "/", decimal and unsigned decimal * respectively. Note that elog.c also knows how to format a vxid. 
*/ - char vxidstr[32]; + char vxidstr[64]; snprintf(vxidstr, sizeof(vxidstr), "%d/%llu", bid, (unsigned long long) lxid); @@ -291,7 +291,9 @@ pg_lock_status(PG_FUNCTION_ARGS) break; case LOCKTAG_TRANSACTION: values[6] = - TransactionIdGetDatum(instance->locktag.locktag_field1); + TransactionIdGetDatum( + (TransactionId) instance->locktag.locktag_field1 | + ((TransactionId) instance->locktag.locktag_field2 << 32)); nulls[1] = true; nulls[2] = true; nulls[3] = true; @@ -303,7 +305,8 @@ pg_lock_status(PG_FUNCTION_ARGS) break; case LOCKTAG_VIRTUALTRANSACTION: values[5] = VXIDGetDatum(instance->locktag.locktag_field1, - instance->locktag.locktag_field2); + (TransactionId) instance->locktag.locktag_field2 | + ((TransactionId) instance->locktag.locktag_field3 << 32)); nulls[1] = true; nulls[2] = true; nulls[3] = true; diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index eadd8464ff..2f89562dc4 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/xact.h" #include "access/xlog.h" #include "access/xlogprefetcher.h" #include "catalog/pg_authid.h" diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index db843a0fbf..5e08d02d64 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -327,6 +327,7 @@ record_out(PG_FUNCTION_ARGS) tuple.t_len = HeapTupleHeaderGetDatumLength(rec); ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tuple); tuple.t_data = rec; /* @@ -694,6 +695,7 @@ record_send(PG_FUNCTION_ARGS) tuple.t_len = HeapTupleHeaderGetDatumLength(rec); ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tuple); tuple.t_data = rec; /* @@ -844,10 +846,12 @@ record_cmp(FunctionCallInfo fcinfo) tuple1.t_len = HeapTupleHeaderGetDatumLength(record1); 
ItemPointerSetInvalid(&(tuple1.t_self)); tuple1.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tuple1); tuple1.t_data = record1; tuple2.t_len = HeapTupleHeaderGetDatumLength(record2); ItemPointerSetInvalid(&(tuple2.t_self)); tuple2.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tuple2); tuple2.t_data = record2; /* @@ -1089,10 +1093,12 @@ record_eq(PG_FUNCTION_ARGS) ItemPointerSetInvalid(&(tuple1.t_self)); tuple1.t_tableOid = InvalidOid; tuple1.t_data = record1; + HeapTupleSetZeroBase(&tuple1); tuple2.t_len = HeapTupleHeaderGetDatumLength(record2); ItemPointerSetInvalid(&(tuple2.t_self)); tuple2.t_tableOid = InvalidOid; tuple2.t_data = record2; + HeapTupleSetZeroBase(&tuple2); /* * We arrange to look up the needed comparison info just once per series @@ -1351,10 +1357,12 @@ record_image_cmp(FunctionCallInfo fcinfo) ItemPointerSetInvalid(&(tuple1.t_self)); tuple1.t_tableOid = InvalidOid; tuple1.t_data = record1; + HeapTupleSetZeroBase(&tuple1); tuple2.t_len = HeapTupleHeaderGetDatumLength(record2); ItemPointerSetInvalid(&(tuple2.t_self)); tuple2.t_tableOid = InvalidOid; tuple2.t_data = record2; + HeapTupleSetZeroBase(&tuple2); /* * We arrange to look up the needed comparison info just once per series @@ -1597,10 +1605,12 @@ record_image_eq(PG_FUNCTION_ARGS) ItemPointerSetInvalid(&(tuple1.t_self)); tuple1.t_tableOid = InvalidOid; tuple1.t_data = record1; + HeapTupleSetZeroBase(&tuple1); tuple2.t_len = HeapTupleHeaderGetDatumLength(record2); ItemPointerSetInvalid(&(tuple2.t_self)); tuple2.t_tableOid = InvalidOid; tuple2.t_data = record2; + HeapTupleSetZeroBase(&tuple2); /* * We arrange to look up the needed comparison info just once per series @@ -1800,6 +1810,7 @@ hash_record(PG_FUNCTION_ARGS) ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; tuple.t_data = record; + HeapTupleSetZeroBase(&tuple); /* * We arrange to look up the needed hashing info just once per series of @@ -1921,6 +1932,7 @@ hash_record_extended(PG_FUNCTION_ARGS) 
ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; tuple.t_data = record; + HeapTupleSetZeroBase(&tuple); /* * We arrange to look up the needed hashing info just once per series of diff --git a/src/backend/utils/adt/xid.c b/src/backend/utils/adt/xid.c index e4b4952a28..056752cfaf 100644 --- a/src/backend/utils/adt/xid.c +++ b/src/backend/utils/adt/xid.c @@ -32,16 +32,16 @@ xidin(PG_FUNCTION_ARGS) { char *str = PG_GETARG_CSTRING(0); - PG_RETURN_TRANSACTIONID((TransactionId) strtoul(str, NULL, 0)); + PG_RETURN_TRANSACTIONID((TransactionId) strtou64(str, NULL, 0)); } Datum xidout(PG_FUNCTION_ARGS) { TransactionId transactionId = PG_GETARG_TRANSACTIONID(0); - char *result = (char *) palloc(16); + char *result = (char *) palloc(32); - snprintf(result, 16, "%lu", (unsigned long) transactionId); + snprintf(result, 32, "%llu", (unsigned long long) transactionId); PG_RETURN_CSTRING(result); } @@ -52,8 +52,13 @@ Datum xidrecv(PG_FUNCTION_ARGS) { StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + uint32 lo, + hi; + + lo = (uint32) pq_getmsgint(buf, sizeof(TransactionId)); + hi = (uint32) pq_getmsgint(buf, sizeof(TransactionId)); - PG_RETURN_TRANSACTIONID((TransactionId) pq_getmsgint(buf, sizeof(TransactionId))); + PG_RETURN_TRANSACTIONID((uint64) lo + ((uint64) hi << 32)); } /* @@ -64,9 +69,15 @@ xidsend(PG_FUNCTION_ARGS) { TransactionId arg1 = PG_GETARG_TRANSACTIONID(0); StringInfoData buf; + uint32 lo, + hi; + + lo = (uint32) (arg1 & 0xFFFFFFFF); + hi = (uint32) (arg1 >> 32); pq_begintypsend(&buf); - pq_sendint32(&buf, arg1); + pq_sendint(&buf, lo, sizeof(lo)); + pq_sendint(&buf, hi, sizeof(hi)); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } @@ -105,9 +116,9 @@ xid_age(PG_FUNCTION_ARGS) /* Permanent XIDs are always infinitely old */ if (!TransactionIdIsNormal(xid)) - PG_RETURN_INT32(INT_MAX); + PG_RETURN_INT64(PG_INT8_MAX); - PG_RETURN_INT32((int32) (now - xid)); + PG_RETURN_INT64((int64) (now - xid)); } /* @@ -120,9 +131,9 @@ mxid_age(PG_FUNCTION_ARGS) 
MultiXactId now = ReadNextMultiXactId(); if (!MultiXactIdIsValid(xid)) - PG_RETURN_INT32(INT_MAX); + PG_RETURN_INT64(PG_INT64_MAX); - PG_RETURN_INT32((int32) (now - xid)); + PG_RETURN_INT64((int64) (now - xid)); } /* @@ -184,7 +195,7 @@ xid8in(PG_FUNCTION_ARGS) { char *str = PG_GETARG_CSTRING(0); - PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromU64(strtou64(str, NULL, 0))); + PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromXid(strtou64(str, NULL, 0))); } Datum @@ -193,7 +204,7 @@ xid8out(PG_FUNCTION_ARGS) FullTransactionId fxid = PG_GETARG_FULLTRANSACTIONID(0); char *result = (char *) palloc(21); - snprintf(result, 21, UINT64_FORMAT, U64FromFullTransactionId(fxid)); + snprintf(result, 21, UINT64_FORMAT, XidFromFullTransactionId(fxid)); PG_RETURN_CSTRING(result); } @@ -204,7 +215,7 @@ xid8recv(PG_FUNCTION_ARGS) uint64 value; value = (uint64) pq_getmsgint64(buf); - PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromU64(value)); + PG_RETURN_FULLTRANSACTIONID(FullTransactionIdFromXid(value)); } Datum @@ -214,7 +225,7 @@ xid8send(PG_FUNCTION_ARGS) StringInfoData buf; pq_begintypsend(&buf); - pq_sendint64(&buf, (uint64) U64FromFullTransactionId(arg1)); + pq_sendint64(&buf, (uint64) XidFromFullTransactionId(arg1)); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } diff --git a/src/backend/utils/adt/xid8funcs.c b/src/backend/utils/adt/xid8funcs.c index d8e40b3b96..6257d985d6 100644 --- a/src/backend/utils/adt/xid8funcs.c +++ b/src/backend/utils/adt/xid8funcs.c @@ -80,8 +80,7 @@ typedef struct * It is an ERROR if the xid is in the future. Otherwise, returns true if * the transaction is still new enough that we can determine whether it * committed and false otherwise. If *extracted_xid is not NULL, it is set - * to the low 32 bits of the transaction ID (i.e. the actual XID, without the - * epoch). + * to the actual transaction ID.
* * The caller must hold XactTruncationLock since it's dealing with arbitrary * XIDs, and must continue to hold it until it's done with any clog lookups @@ -90,15 +89,10 @@ typedef struct static bool TransactionIdInRecentPast(FullTransactionId fxid, TransactionId *extracted_xid) { - uint32 xid_epoch = EpochFromFullTransactionId(fxid); TransactionId xid = XidFromFullTransactionId(fxid); - uint32 now_epoch; - TransactionId now_epoch_next_xid; FullTransactionId now_fullxid; now_fullxid = ReadNextFullTransactionId(); - now_epoch_next_xid = XidFromFullTransactionId(now_fullxid); - now_epoch = EpochFromFullTransactionId(now_fullxid); if (extracted_xid != NULL) *extracted_xid = xid; @@ -115,7 +109,7 @@ TransactionIdInRecentPast(FullTransactionId fxid, TransactionId *extracted_xid) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("transaction ID %llu is in the future", - (unsigned long long) U64FromFullTransactionId(fxid)))); + (unsigned long long) XidFromFullTransactionId(fxid)))); /* * ShmemVariableCache->oldestClogXid is protected by XactTruncationLock, @@ -127,48 +121,15 @@ TransactionIdInRecentPast(FullTransactionId fxid, TransactionId *extracted_xid) Assert(LWLockHeldByMe(XactTruncationLock)); /* - * If the transaction ID has wrapped around, it's definitely too old to - * determine the commit status. Otherwise, we can compare it to - * ShmemVariableCache->oldestClogXid to determine whether the relevant - * CLOG entry is guaranteed to still exist. + * We compare xid to ShmemVariableCache->oldestClogXid to determine + * whether the relevant CLOG entry is guaranteed to still exist. */ - if (xid_epoch + 1 < now_epoch - || (xid_epoch + 1 == now_epoch && xid < now_epoch_next_xid) - || TransactionIdPrecedes(xid, ShmemVariableCache->oldestClogXid)) + if (TransactionIdPrecedes(xid, ShmemVariableCache->oldestClogXid)) return false; return true; } -/* - * Convert a TransactionId obtained from a snapshot held by the caller to a - * FullTransactionId. 
Use next_fxid as a reference FullTransactionId, so that - * we can compute the high order bits. It must have been obtained by the - * caller with ReadNextFullTransactionId() after the snapshot was created. - */ -static FullTransactionId -widen_snapshot_xid(TransactionId xid, FullTransactionId next_fxid) -{ - TransactionId next_xid = XidFromFullTransactionId(next_fxid); - uint32 epoch = EpochFromFullTransactionId(next_fxid); - - /* Special transaction ID. */ - if (!TransactionIdIsNormal(xid)) - return FullTransactionIdFromEpochAndXid(0, xid); - - /* - * The 64 bit result must be <= next_fxid, since next_fxid hadn't been - * issued yet when the snapshot was created. Every TransactionId in the - * snapshot must therefore be from the same epoch as next_fxid, or the - * epoch before. We know this because next_fxid is never allow to get - * more than one epoch ahead of the TransactionIds in any snapshot. - */ - if (xid > next_xid) - epoch--; - - return FullTransactionIdFromEpochAndXid(epoch, xid); -} - /* * txid comparator for qsort/bsearch */ @@ -295,12 +256,12 @@ parse_snapshot(const char *str) char *endp; StringInfo buf; - xmin = FullTransactionIdFromU64(strtou64(str, &endp, 10)); + xmin = FullTransactionIdFromXid(strtou64(str, &endp, 10)); if (*endp != ':') goto bad_format; str = endp + 1; - xmax = FullTransactionIdFromU64(strtou64(str, &endp, 10)); + xmax = FullTransactionIdFromXid(strtou64(str, &endp, 10)); if (*endp != ':') goto bad_format; str = endp + 1; @@ -318,7 +279,7 @@ parse_snapshot(const char *str) while (*str != '\0') { /* read next value */ - val = FullTransactionIdFromU64(strtou64(str, &endp, 10)); + val = FullTransactionIdFromXid(strtou64(str, &endp, 10)); str = endp; /* require the input to be in order */ @@ -397,7 +358,6 @@ pg_current_snapshot(PG_FUNCTION_ARGS) uint32 nxip, i; Snapshot cur; - FullTransactionId next_fxid = ReadNextFullTransactionId(); cur = GetActiveSnapshot(); if (cur == NULL) @@ -415,11 +375,11 @@ 
pg_current_snapshot(PG_FUNCTION_ARGS) snap = palloc(PG_SNAPSHOT_SIZE(nxip)); /* fill */ - snap->xmin = widen_snapshot_xid(cur->xmin, next_fxid); - snap->xmax = widen_snapshot_xid(cur->xmax, next_fxid); + snap->xmin = FullTransactionIdFromXid(cur->xmin); + snap->xmax = FullTransactionIdFromXid(cur->xmax); snap->nxip = nxip; for (i = 0; i < nxip; i++) - snap->xip[i] = widen_snapshot_xid(cur->xip[i], next_fxid); + snap->xip[i] = FullTransactionIdFromXid(cur->xip[i]); /* * We want them guaranteed to be in ascending order. This also removes @@ -467,16 +427,16 @@ pg_snapshot_out(PG_FUNCTION_ARGS) initStringInfo(&str); appendStringInfo(&str, UINT64_FORMAT ":", - U64FromFullTransactionId(snap->xmin)); + XidFromFullTransactionId(snap->xmin)); appendStringInfo(&str, UINT64_FORMAT ":", - U64FromFullTransactionId(snap->xmax)); + XidFromFullTransactionId(snap->xmax)); for (i = 0; i < snap->nxip; i++) { if (i > 0) appendStringInfoChar(&str, ','); appendStringInfo(&str, UINT64_FORMAT, - U64FromFullTransactionId(snap->xip[i])); + XidFromFullTransactionId(snap->xip[i])); } PG_RETURN_CSTRING(str.data); @@ -505,8 +465,8 @@ pg_snapshot_recv(PG_FUNCTION_ARGS) if (nxip < 0 || nxip > PG_SNAPSHOT_MAX_NXIP) goto bad_format; - xmin = FullTransactionIdFromU64((uint64) pq_getmsgint64(buf)); - xmax = FullTransactionIdFromU64((uint64) pq_getmsgint64(buf)); + xmin = FullTransactionIdFromXid((uint64) pq_getmsgint64(buf)); + xmax = FullTransactionIdFromXid((uint64) pq_getmsgint64(buf)); if (!FullTransactionIdIsValid(xmin) || !FullTransactionIdIsValid(xmax) || FullTransactionIdPrecedes(xmax, xmin)) @@ -519,7 +479,7 @@ pg_snapshot_recv(PG_FUNCTION_ARGS) for (i = 0; i < nxip; i++) { FullTransactionId cur = - FullTransactionIdFromU64((uint64) pq_getmsgint64(buf)); + FullTransactionIdFromXid((uint64) pq_getmsgint64(buf)); if (FullTransactionIdPrecedes(cur, last) || FullTransactionIdPrecedes(cur, xmin) || @@ -564,10 +524,10 @@ pg_snapshot_send(PG_FUNCTION_ARGS) pq_begintypsend(&buf); pq_sendint32(&buf, 
snap->nxip); - pq_sendint64(&buf, (int64) U64FromFullTransactionId(snap->xmin)); - pq_sendint64(&buf, (int64) U64FromFullTransactionId(snap->xmax)); + pq_sendint64(&buf, (int64) XidFromFullTransactionId(snap->xmin)); + pq_sendint64(&buf, (int64) XidFromFullTransactionId(snap->xmax)); for (i = 0; i < snap->nxip; i++) - pq_sendint64(&buf, (int64) U64FromFullTransactionId(snap->xip[i])); + pq_sendint64(&buf, (int64) XidFromFullTransactionId(snap->xip[i])); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } @@ -655,8 +615,7 @@ pg_snapshot_xip(PG_FUNCTION_ARGS) * Report the status of a recent transaction ID, or null for wrapped, * truncated away or otherwise too old XIDs. * - * The passed epoch-qualified xid is treated as a normal xid, not a - * multixact id. + * The passed xid is treated as a normal xid, not a multixact id. * * If it points to a committed subxact the result is the subxact status even * though the parent xact may still be in progress or may have aborted. diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 38e943fab2..b69aa01cf9 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -1839,6 +1839,7 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, Datum *arguments, memcpy((char *) ct->tuple.t_data, (const char *) dtp->t_data, dtp->t_len); + HeapTupleCopyBase(&ct->tuple, dtp); MemoryContextSwitchTo(oldcxt); if (dtp != ntp) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 00dc0f2403..233530cf35 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2307,8 +2307,7 @@ RelationReloadIndexInfo(Relation relation) relation->rd_index->indislive = index->indislive; /* Copy xmin too, as that is needed to make sense of indcheckxmin */ - HeapTupleHeaderSetXmin(relation->rd_indextuple->t_data, - HeapTupleHeaderGetXmin(tuple->t_data)); + HeapTupleSetXmin(relation->rd_indextuple, HeapTupleGetXmin(tuple)); 
ReleaseSysCache(tuple); } diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index a9dd068095..34fe6bea83 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -540,7 +540,7 @@ lookup_C_func(HeapTuple procedureTuple) NULL); if (entry == NULL) return NULL; /* no such entry */ - if (entry->fn_xmin == HeapTupleHeaderGetRawXmin(procedureTuple->t_data) && + if (entry->fn_xmin == HeapTupleGetRawXmin(procedureTuple) && ItemPointerEquals(&entry->fn_tid, &procedureTuple->t_self)) return entry; /* OK */ return NULL; /* entry is out of date */ @@ -576,7 +576,7 @@ record_C_func(HeapTuple procedureTuple, HASH_ENTER, &found); /* OID is already filled in */ - entry->fn_xmin = HeapTupleHeaderGetRawXmin(procedureTuple->t_data); + entry->fn_xmin = HeapTupleGetRawXmin(procedureTuple); entry->fn_tid = procedureTuple->t_self; entry->user_fn = user_fn; entry->inforec = inforec; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 12a1f30f7c..1414b1aef2 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -857,6 +857,14 @@ build_guc_variables(void) num_vars++; } + for (i = 0; ConfigureNamesInt64[i].gen.name; i++) + { + struct config_int64 *conf = &ConfigureNamesInt64[i]; + + conf->gen.vartype = PGC_INT64; + num_vars++; + } + for (i = 0; ConfigureNamesReal[i].gen.name; i++) { struct config_real *conf = &ConfigureNamesReal[i]; @@ -897,6 +905,9 @@ build_guc_variables(void) for (i = 0; ConfigureNamesInt[i].gen.name; i++) guc_vars[num_vars++] = &ConfigureNamesInt[i].gen; + for (i = 0; ConfigureNamesInt64[i].gen.name; i++) + guc_vars[num_vars++] = &ConfigureNamesInt64[i].gen; + for (i = 0; ConfigureNamesReal[i].gen.name; i++) guc_vars[num_vars++] = &ConfigureNamesReal[i].gen; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 8869eb4112..5a7ef089f4 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2438,74 
+2438,6 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, - { - {"vacuum_freeze_min_age", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Minimum age at which VACUUM should freeze a table row."), - NULL - }, - &vacuum_freeze_min_age, - 50000000, 0, 1000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Age at which VACUUM should scan whole table to freeze tuples."), - NULL - }, - &vacuum_freeze_table_age, - 150000000, 0, 2000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_multixact_freeze_min_age", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Minimum age at which VACUUM should freeze a MultiXactId in a table row."), - NULL - }, - &vacuum_multixact_freeze_min_age, - 5000000, 0, 1000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_multixact_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Multixact age at which VACUUM should scan whole table to freeze tuples."), - NULL - }, - &vacuum_multixact_freeze_table_age, - 150000000, 0, 2000000000, - NULL, NULL, NULL - }, - - { - {"vacuum_defer_cleanup_age", PGC_SIGHUP, REPLICATION_PRIMARY, - gettext_noop("Number of transactions by which VACUUM and HOT cleanup should be deferred, if any."), - NULL - }, - &vacuum_defer_cleanup_age, - 0, 0, 1000000, /* see ComputeXidHorizons */ - NULL, NULL, NULL - }, - { - {"vacuum_failsafe_age", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Age at which VACUUM should trigger failsafe to avoid a wraparound outage."), - NULL - }, - &vacuum_failsafe_age, - 1600000000, 0, 2100000000, - NULL, NULL, NULL - }, - { - {"vacuum_multixact_failsafe_age", PGC_USERSET, CLIENT_CONN_STATEMENT, - gettext_noop("Multixact age at which VACUUM should trigger failsafe to avoid a wraparound outage."), - NULL - }, - &vacuum_multixact_failsafe_age, - 1600000000, 0, 2100000000, - NULL, NULL, NULL - }, - /* * See also CheckRequiredParameterValues() if this parameter changes */ @@ -3116,28 +3048,6 @@ 
struct config_int ConfigureNamesInt[] = 50, 0, INT_MAX, NULL, NULL, NULL }, - { - /* see varsup.c for why this is PGC_POSTMASTER not PGC_SIGHUP */ - {"autovacuum_freeze_max_age", PGC_POSTMASTER, AUTOVACUUM, - gettext_noop("Age at which to autovacuum a table to prevent transaction ID wraparound."), - NULL - }, - &autovacuum_freeze_max_age, - - /* see vacuum_failsafe_age if you change the upper-limit value. */ - 200000000, 100000, 2000000000, - NULL, NULL, NULL - }, - { - /* see multixact.c for why this is PGC_POSTMASTER not PGC_SIGHUP */ - {"autovacuum_multixact_freeze_max_age", PGC_POSTMASTER, AUTOVACUUM, - gettext_noop("Multixact age at which to autovacuum a table to prevent multixact wraparound."), - NULL - }, - &autovacuum_multixact_freeze_max_age, - 400000000, 10000, 2000000000, - NULL, NULL, NULL - }, { /* see max_connections */ {"autovacuum_max_workers", PGC_POSTMASTER, AUTOVACUUM, @@ -3415,6 +3325,96 @@ struct config_int ConfigureNamesInt[] = struct config_int64 ConfigureNamesInt64[] = { + { + {"vacuum_freeze_min_age", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Minimum age at which VACUUM should freeze a table row."), + NULL + }, + &vacuum_freeze_min_age, + INT64CONST(50000000), INT64CONST(0), INT64CONST(0x7FFFFFFFFFFFFFFF), + NULL, NULL, NULL + }, + + { + {"vacuum_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Age at which VACUUM should scan whole table to freeze tuples."), + NULL + }, + &vacuum_freeze_table_age, + INT64CONST(150000000), INT64CONST(0), INT64CONST(0x7FFFFFFFFFFFFFFF), + NULL, NULL, NULL + }, + + { + {"vacuum_multixact_freeze_min_age", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Minimum age at which VACUUM should freeze a MultiXactId in a table row."), + NULL + }, + &vacuum_multixact_freeze_min_age, + INT64CONST(5000000), INT64CONST(0), INT64CONST(0x7FFFFFFFFFFFFFFF), + NULL, NULL, NULL + }, + + { + {"vacuum_multixact_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Multixact age 
at which VACUUM should scan whole table to freeze tuples."), + NULL + }, + &vacuum_multixact_freeze_table_age, + INT64CONST(150000000), INT64CONST(0), INT64CONST(0x7FFFFFFFFFFFFFFF), + NULL, NULL, NULL + }, + + { + {"vacuum_defer_cleanup_age", PGC_SIGHUP, REPLICATION_PRIMARY, + gettext_noop("Number of transactions by which VACUUM and HOT cleanup should be deferred, if any."), + NULL + }, + &vacuum_defer_cleanup_age, + INT64CONST(0), INT64CONST(0), INT64CONST(1000000), /* see ComputeXidHorizons */ + NULL, NULL, NULL + }, + { + {"vacuum_failsafe_age", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Age at which VACUUM should trigger failsafe to avoid a wraparound outage."), + NULL + }, + &vacuum_failsafe_age, + INT64CONST(1600000000), INT64CONST(0), INT64CONST(2100000000), + NULL, NULL, NULL + }, + { + {"vacuum_multixact_failsafe_age", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Multixact age at which VACUUM should trigger failsafe to avoid a wraparound outage."), + NULL + }, + &vacuum_multixact_failsafe_age, + INT64CONST(1600000000), INT64CONST(0), INT64CONST(2100000000), + NULL, NULL, NULL + }, + { + /* see varsup.c for why this is PGC_POSTMASTER not PGC_SIGHUP */ + {"autovacuum_freeze_max_age", PGC_POSTMASTER, AUTOVACUUM, + gettext_noop("Age at which to autovacuum a table to prevent transaction ID wraparound."), + NULL + }, + &autovacuum_freeze_max_age, + + /* see vacuum_failsafe_age if you change the upper-limit value. 
*/ + INT64CONST(10000000000), INT64CONST(100000), INT64CONST(0x7FFFFFFFFFFFFFFF), + NULL, NULL, NULL + }, + { + /* see multixact.c for why this is PGC_POSTMASTER not PGC_SIGHUP */ + {"autovacuum_multixact_freeze_max_age", PGC_POSTMASTER, AUTOVACUUM, + gettext_noop("Multixact age at which to autovacuum a table to prevent multixact wraparound."), + NULL + }, + &autovacuum_multixact_freeze_max_age, + INT64CONST(20000000000), INT64CONST(10000), INT64CONST(0x7FFFFFFFFFFFFFFF), + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL diff --git a/src/backend/utils/misc/help_config.c b/src/backend/utils/misc/help_config.c index 61c83f3590..19a316ec99 100644 --- a/src/backend/utils/misc/help_config.c +++ b/src/backend/utils/misc/help_config.c @@ -33,6 +33,7 @@ typedef union struct config_bool _bool; struct config_real real; struct config_int integer; + struct config_int64 integer8; struct config_string string; struct config_enum _enum; } mixedStruct; @@ -107,7 +108,12 @@ printMixedStruct(mixedStruct *structToPrint) structToPrint->integer.min, structToPrint->integer.max); break; - + case PGC_INT64: + printf("INT64\t%lld\t%lld\t%lld\t", + (long long) structToPrint->integer8.reset_val, + (long long) structToPrint->integer8.min, + (long long) structToPrint->integer8.max); + break; case PGC_REAL: printf("REAL\t%g\t%g\t%g\t", structToPrint->real.reset_val, diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 4ab4a0a701..ffffdeccbb 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -165,7 +165,7 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) nulls[5] = false; values[6] = CStringGetTextDatum(psprintf("%llu", - (unsigned long long) U64FromFullTransactionId(ControlFile->checkPointCopy.nextXid))); + (unsigned long long) XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid))); nulls[6] = false; values[7] = 
ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid); diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 868d21c351..d51f1d1863 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -647,9 +647,9 @@ #autovacuum_vacuum_insert_scale_factor = 0.2 # fraction of inserts over table # size before insert vacuum #autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze -#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum +#autovacuum_freeze_max_age = 10000000000 # maximum XID age before forced vacuum # (change requires restart) -#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age +#autovacuum_multixact_freeze_max_age = 20000000000 # maximum multixact age # before forced vacuum # (change requires restart) #autovacuum_vacuum_cost_delay = 2ms # default vacuum cost delay for diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index afa5bdbf04..3ee2d869e6 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -1163,11 +1163,16 @@ writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup) { TuplesortPublic *base = TuplesortstateGetPublic(state); HeapTuple tuple = (HeapTuple) stup->tuple; - unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + unsigned int tuplen = tuple->t_len + + sizeof(ItemPointerData) + + 2 * sizeof(TransactionId) + /* tuple xmin, xmax */ + sizeof(int); /* We need to store t_self, but not other fields of HeapTupleData */ LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(tape, &tuple->t_xmin, sizeof(TransactionId)); + LogicalTapeWrite(tape, &tuple->t_xmax, sizeof(TransactionId)); LogicalTapeWrite(tape, tuple->t_data, tuple->t_len); if (base->sortopt & 
TUPLESORT_RANDOMACCESS) /* need trailing length word? */ LogicalTapeWrite(tape, &tuplen, sizeof(tuplen)); @@ -1179,7 +1184,10 @@ readtup_cluster(Tuplesortstate *state, SortTuple *stup, { TuplesortPublic *base = TuplesortstateGetPublic(state); TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg; - unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + unsigned int t_len = tuplen - + sizeof(ItemPointerData) - + 2 * sizeof(TransactionId) - /* tuple xmin, xmax */ + sizeof(int); HeapTuple tuple = (HeapTuple) tuplesort_readtup_alloc(state, t_len + HEAPTUPLESIZE); @@ -1187,6 +1195,8 @@ readtup_cluster(Tuplesortstate *state, SortTuple *stup, tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); tuple->t_len = t_len; LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeReadExact(tape, &tuple->t_xmin, sizeof(TransactionId)); + LogicalTapeReadExact(tape, &tuple->t_xmax, sizeof(TransactionId)); /* We don't currently bother to reconstruct t_tableOid */ tuple->t_tableOid = InvalidOid; /* Read in the tuple body */ diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c index 6613dc0534..f673624f68 100644 --- a/src/backend/utils/time/combocid.c +++ b/src/backend/utils/time/combocid.c @@ -101,12 +101,13 @@ static CommandId GetRealCmax(CommandId combocid); */ CommandId -HeapTupleHeaderGetCmin(HeapTupleHeader tup) +HeapTupleGetCmin(HeapTuple tuple) { + HeapTupleHeader tup = tuple->t_data; CommandId cid = HeapTupleHeaderGetRawCommandId(tup); Assert(!(tup->t_infomask & HEAP_MOVED)); - Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tup))); + Assert(TransactionIdIsCurrentTransactionId(HeapTupleGetXmin(tuple))); if (tup->t_infomask & HEAP_COMBOCID) return GetRealCmin(cid); @@ -115,8 +116,9 @@ HeapTupleHeaderGetCmin(HeapTupleHeader tup) } CommandId -HeapTupleHeaderGetCmax(HeapTupleHeader tup) +HeapTupleGetCmax(HeapTuple tuple) { + HeapTupleHeader tup = tuple->t_data; CommandId 
cid = HeapTupleHeaderGetRawCommandId(tup); Assert(!(tup->t_infomask & HEAP_MOVED)); @@ -128,7 +130,7 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup) * things too much. */ Assert(CritSectionCount > 0 || - TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tup))); + TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXidAny(tuple))); if (tup->t_infomask & HEAP_COMBOCID) return GetRealCmax(cid); @@ -150,7 +152,7 @@ HeapTupleHeaderGetCmax(HeapTupleHeader tup) * changes the tuple in shared buffers. */ void -HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, +HeapTupleHeaderAdjustCmax(HeapTuple tup, CommandId *cmax, bool *iscombo) { @@ -160,10 +162,10 @@ HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, * Test for HeapTupleHeaderXminCommitted() first, because it's cheaper * than a TransactionIdIsCurrentTransactionId call. */ - if (!HeapTupleHeaderXminCommitted(tup) && - TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tup))) + if (!HeapTupleHeaderXminCommitted(tup->t_data) && + TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmin(tup))) { - CommandId cmin = HeapTupleHeaderGetCmin(tup); + CommandId cmin = HeapTupleGetCmin(tup); *cmax = GetComboCommandId(cmin, *cmax); *iscombo = true; diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index f76bab65cb..2be450529f 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -950,15 +950,15 @@ SnapshotResetXmin(void) if (pairingheap_is_empty(&RegisteredSnapshots)) { - MyProc->xmin = InvalidTransactionId; + pg_atomic_write_u64(&MyProc->xmin, InvalidTransactionId); return; } minSnapshot = pairingheap_container(SnapshotData, ph_node, pairingheap_first(&RegisteredSnapshots)); - if (TransactionIdPrecedes(MyProc->xmin, minSnapshot->xmin)) - MyProc->xmin = minSnapshot->xmin; + if (TransactionIdPrecedes(pg_atomic_read_u64(&MyProc->xmin), minSnapshot->xmin)) + pg_atomic_write_u64(&MyProc->xmin, minSnapshot->xmin); } /* @@ -1111,7 +1111,7 @@ 
AtEOXact_Snapshot(bool isCommit, bool resetXmin) if (resetXmin) SnapshotResetXmin(); - Assert(resetXmin || MyProc->xmin == 0); + Assert(resetXmin || pg_atomic_read_u64(&MyProc->xmin) == 0); } @@ -1176,8 +1176,9 @@ ExportSnapshot(Snapshot snapshot) * Generate file path for the snapshot. We start numbering of snapshots * inside the transaction from 1. */ - snprintf(path, sizeof(path), SNAPSHOT_EXPORT_DIR "/%08X-%08X-%d", - MyProc->backendId, MyProc->lxid, list_length(exportedSnapshots) + 1); + snprintf(path, sizeof(path), SNAPSHOT_EXPORT_DIR "/%08X-%08X%08X-%d", + MyProc->backendId, (uint32) (MyProc->lxid >> 32), + (uint32) MyProc->lxid, list_length(exportedSnapshots) + 1); /* * Copy the snapshot into TopTransactionContext, add it to the @@ -1353,7 +1354,7 @@ parseXidFromText(const char *prefix, char **s, const char *filename) (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", filename))); ptr += prefixlen; - if (sscanf(ptr, "%u", &val) != 1) + if (sscanf(ptr, "%" INT64_MODIFIER "u", &val) != 1) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", filename))); @@ -1378,7 +1379,7 @@ parseVxidFromText(const char *prefix, char **s, const char *filename, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", filename))); ptr += prefixlen; - if (sscanf(ptr, "%d/%u", &vxid->backendId, &vxid->localTransactionId) != 2) + if (sscanf(ptr, "%d/%" INT64_MODIFIER "u", &vxid->backendId, &vxid->localTransactionId) != 2) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", filename))); @@ -1837,7 +1838,7 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, */ if (old_snapshot_threshold == 0) { - if (TransactionIdPrecedes(latest_xmin, MyProc->xmin) + if (TransactionIdPrecedes(latest_xmin, pg_atomic_read_u64(&MyProc->xmin)) && TransactionIdFollows(latest_xmin, xlimit)) xlimit = 
latest_xmin; @@ -2321,7 +2322,7 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) if (!snapshot->suboverflowed) { /* we have full data, so search subxip */ - if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt)) + if (pg_lfind64(xid, snapshot->subxip, snapshot->subxcnt)) return true; /* not there, fall through to search xip[] */ @@ -2343,7 +2344,7 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) return false; } - if (pg_lfind32(xid, snapshot->xip, snapshot->xcnt)) + if (pg_lfind64(xid, snapshot->xip, snapshot->xcnt)) return true; } else @@ -2377,7 +2378,7 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) * indeterminate xid. We don't know whether it's top level or subxact * but it doesn't matter. If it's present, the xid is visible. */ - if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt)) + if (pg_lfind64(xid, snapshot->subxip, snapshot->subxcnt)) return true; } diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index c56e1ac86c..f2812e1a1e 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -1305,7 +1305,7 @@ bootstrap_template1(void) escape_quotes_bki(username)); /* relfrozenxid must not be less than FirstNormalTransactionId */ - sprintf(buf, "%u", Max(start_xid, 3)); + sprintf(buf, "%llu", (unsigned long long) Max(start_xid, 3)); bki_lines = replace_token(bki_lines, "RECENTXMIN", buf); @@ -1328,13 +1328,13 @@ bootstrap_template1(void) unsetenv("PGCLIENTENCODING"); snprintf(cmd, sizeof(cmd), - "\"%s\" --boot -X %d %s %s %u %s %u %s %u %s %s %s", + "\"%s\" --boot -X %d %s %s %llu %s %llu %s %llu %s %s %s", backend_exec, wal_segment_size_mb * (1024 * 1024), data_checksums ? "-k" : "", - "-m", start_mxid, - "-o", start_mxoff, - "-x", start_xid, + "-m", (unsigned long long) start_mxid, + "-o", (unsigned long long) start_mxoff, + "-x", (unsigned long long) start_xid, boot_options, extra_options, debug ? 
"-d 5" : ""); @@ -2177,15 +2177,18 @@ usage(const char *progname) printf(_(" --discard-caches set debug_discard_caches=1\n")); printf(_(" -L DIRECTORY where to find the input files\n")); printf(_(" -m, --multixact-id=START_MXID\n" - " set initial database cluster multixact id\n")); + " set initial database cluster multixact id\n" + " max value is 2^62-1\n")); printf(_(" -n, --no-clean do not clean up after errors\n")); printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); printf(_(" --no-instructions do not print instructions for next steps\n")); printf(_(" -o, --multixact-offset=START_MXOFF\n" - " set initial database cluster multixact offset\n")); + " set initial database cluster multixact offset\n" + " max value is 2^62-1")); printf(_(" -s, --show show internal settings\n")); printf(_(" -S, --sync-only only sync database files to disk, then exit\n")); - printf(_(" -x, --xid=START_XID set initial database cluster xid\n")); + printf(_(" -x, --xid=START_XID set initial database cluster xid\n" + " max value is 2^62-1\n")); printf(_("\nOther options:\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); @@ -2723,13 +2726,16 @@ initialize_data_directory(void) setup_config(); if (start_mxid != 0) - printf(_("selecting initial multixact id ... %u\n"), start_mxid); + printf(_("selecting initial multixact id ... %llu\n"), + (unsigned long long) start_mxid); if (start_mxoff != 0) - printf(_("selecting initial multixact offset ... %u\n"), start_mxoff); + printf(_("selecting initial multixact offset ... %llu\n"), + (unsigned long long) start_mxoff); if (start_xid != 0) - printf(_("selecting initial xid ... %u\n"), start_xid); + printf(_("selecting initial xid ... 
%llu\n"), + (unsigned long long) start_xid); /* Bootstrap template1 */ bootstrap_template1(); @@ -2747,11 +2753,11 @@ initialize_data_directory(void) fflush(stdout); snprintf(cmd, sizeof(cmd), - "\"%s\" %s %s %s %u %s %u %s %u template1 >%s", + "\"%s\" %s %s %s %llu %s %llu %s %llu template1 >%s", backend_exec, backend_options, extra_options, - "-m", start_mxid, - "-o", start_mxoff, - "-x", start_xid, + "-m", (unsigned long long) start_mxid, + "-o", (unsigned long long) start_mxoff, + "-x", (unsigned long long) start_xid, DEVNULL); PG_CMD_OPEN; @@ -2918,15 +2924,13 @@ main(int argc, char *argv[]) break; case 'm': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_mxid = value; + start_mxid = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_mxid) /* overflow */ + !StartMultiXactIdIsValid(start_mxid)) { pg_log_error("invalid initial database cluster multixact id"); exit(1); @@ -2951,15 +2955,13 @@ main(int argc, char *argv[]) break; case 'o': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_mxoff = value; + start_mxoff = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_mxoff) /* overflow */ + !StartMultiXactOffsetIsValid(start_mxoff)) { pg_log_error("invalid initial database cluster multixact offset"); exit(1); @@ -3038,15 +3040,13 @@ main(int argc, char *argv[]) break; case 'x': { - unsigned long value; - char *endptr; + char *endptr; errno = 0; - value = strtoul(optarg, &endptr, 0); - start_xid = value; + start_xid = strtoull(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0 || - value != start_xid) /* overflow */ + !StartTransactionIdIsValid(start_xid)) { pg_log_error("invalid value for initial database cluster xid"); exit(1); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl 
index 57a68091d4..d595e45bd3 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -149,28 +149,28 @@ command_fails( # Set non-standard initial mxid/mxoff/xid. command_fails_like( - [ 'initdb', '-m', '4294967296', $datadir ], + [ 'initdb', '-m', '9223372036854775807', $datadir ], qr/initdb: error: invalid initial database cluster multixact id/, 'fails for invalid initial database cluster multixact id'); command_fails_like( - [ 'initdb', '-o', '4294967296', $datadir ], + [ 'initdb', '-o', '9223372036854775807', $datadir ], qr/initdb: error: invalid initial database cluster multixact offset/, 'fails for invalid initial database cluster multixact offset'); command_fails_like( - [ 'initdb', '-x', '4294967296', $datadir ], + [ 'initdb', '-x', '9223372036854775807', $datadir ], qr/initdb: error: invalid value for initial database cluster xid/, 'fails for invalid initial database cluster xid'); command_fails_like( - [ 'initdb', '-m', '0x100000000', $datadir ], + [ 'initdb', '-m', '0x10000000000000000', $datadir ], qr/initdb: error: invalid initial database cluster multixact id/, 'fails for invalid initial database cluster multixact id'); command_fails_like( - [ 'initdb', '-o', '0x100000000', $datadir ], + [ 'initdb', '-o', '0x10000000000000000', $datadir ], qr/initdb: error: invalid initial database cluster multixact offset/, 'fails for invalid initial database cluster multixact offset'); command_fails_like( - [ 'initdb', '-x', '0x100000000', $datadir ], + [ 'initdb', '-x', '0x10000000000000000', $datadir ], qr/initdb: error: invalid value for initial database cluster xid/, 'fails for invalid initial database cluster xid'); diff --git a/src/bin/pg_amcheck/t/004_verify_heapam.pl b/src/bin/pg_amcheck/t/004_verify_heapam.pl index 8050811105..6563eb798b 100644 --- a/src/bin/pg_amcheck/t/004_verify_heapam.pl +++ b/src/bin/pg_amcheck/t/004_verify_heapam.pl @@ -9,6 +9,8 @@ use PostgreSQL::Test::Utils; use Test::More; +use Data::Dumper; + # This 
regression test demonstrates that the pg_amcheck binary correctly # identifies specific kinds of corruption within pages. To test this, we need # a mechanism to create corrupt pages with predictable, repeatable corruption. @@ -85,6 +87,62 @@ use Test::More; use constant HEAPTUPLE_PACK_CODE => 'LLLSSSSSCCLLCCCCCCCCCCllLL'; use constant HEAPTUPLE_PACK_LENGTH => 58; # Total size +use constant HEAPPAGE_SPECIAL_PACK_CODE => 'QQ'; +use constant HEAPPAGE_SPECIAL_PACK_LENGTH => 16; +use constant HEAPPAGE_SIZE => 8192; + +# Some #define constants from access/htup_details.h for use while corrupting. +use constant HEAP_HASNULL => 0x0001; +use constant HEAP_XMAX_LOCK_ONLY => 0x0080; +use constant HEAP_XMIN_COMMITTED => 0x0100; +use constant HEAP_XMIN_INVALID => 0x0200; +use constant HEAP_XMAX_COMMITTED => 0x0400; +use constant HEAP_XMAX_INVALID => 0x0800; +use constant HEAP_NATTS_MASK => 0x07FF; +use constant HEAP_XMAX_IS_MULTI => 0x1000; +use constant HEAP_KEYS_UPDATED => 0x2000; + +use constant FIRST_NORMAL_TRANSACTION_ID => 3; + +# Read page special data +sub read_special_data +{ + my ($fh, $offset) = @_; + my ($buffer, %special); + + $offset -= $offset % HEAPPAGE_SIZE; + $offset += HEAPPAGE_SIZE - HEAPPAGE_SPECIAL_PACK_LENGTH; + + sysseek($fh, $offset, 0) + or BAIL_OUT("sysseek failed: $!"); + defined(sysread($fh, $buffer, HEAPPAGE_SPECIAL_PACK_LENGTH)) + or BAIL_OUT("sysread failed: $!"); + + @_ = unpack(HEAPPAGE_SPECIAL_PACK_CODE, $buffer); + %special = ( + pd_xid_base => shift, + pd_multi_base => shift); + return \%special; +} + +# Write page special data +sub write_special_data +{ + my ($fh, $offset, $special) = @_; + + $offset -= $offset % HEAPPAGE_SIZE; + $offset += HEAPPAGE_SIZE - HEAPPAGE_SPECIAL_PACK_LENGTH; + + my $buffer = pack( + HEAPPAGE_SPECIAL_PACK_CODE, + $special->{pd_xid_base}, $special->{pd_multi_base}); + + sysseek($fh, $offset, 0) + or BAIL_OUT("sysseek failed: $!"); + defined(syswrite($fh, $buffer, HEAPPAGE_SPECIAL_PACK_LENGTH)) + or 
+ my $xmin = $tup->{t_xmin} - $special->{pd_xid_base}; + die "tuple x_min $tup->{t_xmin} is too small for pd_xid_base $special->{pd_xid_base}" + if $xmin < 3; + $tup->{t_xmin} = $xmin; + + if (($tup->{t_infomask} & HEAP_XMAX_IS_MULTI) == 0) + { + my $xmax = $tup->{t_xmax} - $special->{pd_xid_base}; + die "tuple x_max $tup->{t_xmax} is too small for pd_xid_base $special->{pd_xid_base}" + if $xmax < 3; + $tup->{t_xmax} = $xmax; + } + else + { + my $xmax = $tup->{t_xmax} - $special->{pd_multi_base}; + die "tuple multi x_max $tup->{t_xmax} is too small for pd_multi_base $special->{pd_multi_base}" + if $xmax < 3; + $tup->{t_xmax} = $xmax; + }
+ die "xid_base $xid_base must not exceed existing $special->{pd_xid_base}" + if ($xid_base > $special->{pd_xid_base}); + die "multi_base $multi_base must not exceed existing $special->{pd_multi_base}" + if ($multi_base > $special->{pd_multi_base} && $special->{pd_multi_base} != 0); + return if ($xid_base == $special->{pd_xid_base} && + $multi_base == $special->{pd_multi_base}); + + my $xid_delta = $special->{pd_xid_base} - $xid_base; + my $multi_delta = $special->{pd_multi_base} - $multi_base; + + for my $off (@$lp_off) + { + # change only tuples on this page. + next if ($off < $offset || $off > $offset + HEAPPAGE_SIZE);
+# Set 64bit xid bases a bit in the past so that we can set xmin/xmax a bit +# in the past
sub header @@ -342,7 +472,7 @@ sub header # my @expected; open($file, '+<', $relpath) - or BAIL_OUT("open failed: $!"); + or BAIL_OUT("open failed: $!"); binmode $file; for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) @@ -367,7 +497,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) if ($offnum == 2) { # Corruptly set xmin < datfrozenxid - my $xmin = 3; + my $xmin = $datfrozenxid - 12; $tup->{t_xmin} = $xmin; $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED; $tup->{t_infomask} &= ~HEAP_XMIN_INVALID; @@ -377,24 +507,24 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) } elsif ($offnum == 3) { - # Corruptly set xmin < datfrozenxid, further back, noting circularity - # of xid comparison. For a new cluster with epoch = 0, the corrupt - # xmin will be interpreted as in the future - $tup->{t_xmin} = 4026531839; + # Corruptly set xmin > next transaction id. + my $xmin = $relfrozenxid + 1000000; + $tup->{t_xmin} = $xmin; $tup->{t_infomask} &= ~HEAP_XMIN_COMMITTED; $tup->{t_infomask} &= ~HEAP_XMIN_INVALID; push @expected, - qr/${$header}xmin 4026531839 equals or exceeds next valid transaction ID \d+/; + qr/${$header}xmin $xmin equals or exceeds next valid transaction ID \d+/; } elsif ($offnum == 4) { - # Corruptly set xmax < relminmxid; - $tup->{t_xmax} = 4026531839; + # Corruptly set xmax > next transaction id. 
+ my $xmax = $relfrozenxid + 1000000; + $tup->{t_xmax} = $xmax; $tup->{t_infomask} &= ~HEAP_XMAX_INVALID; push @expected, - qr/${$header}xmax 4026531839 equals or exceeds next valid transaction ID \d+/; + qr/${$header}xmax $xmax equals or exceeds next valid transaction ID \d+/; } elsif ($offnum == 5) { @@ -402,8 +532,8 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $tup->{t_hoff} += 128; push @expected, - qr/${$header}data begins at offset 152 beyond the tuple length 58/, - qr/${$header}tuple data should begin at byte 24, but actually begins at byte 152 \(3 attributes, no nulls\)/; + qr/${$header}data begins at offset 152 beyond the tuple length 58/, + qr/${$header}tuple data should begin at byte 24, but actually begins at byte 152 \(3 attributes, no nulls\)/; } elsif ($offnum == 6) { @@ -411,7 +541,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $tup->{t_hoff} += 3; push @expected, - qr/${$header}tuple data should begin at byte 24, but actually begins at byte 27 \(3 attributes, no nulls\)/; + qr/${$header}tuple data should begin at byte 24, but actually begins at byte 27 \(3 attributes, no nulls\)/; } elsif ($offnum == 7) { @@ -419,7 +549,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $tup->{t_hoff} -= 8; push @expected, - qr/${$header}tuple data should begin at byte 24, but actually begins at byte 16 \(3 attributes, no nulls\)/; + qr/${$header}tuple data should begin at byte 24, but actually begins at byte 16 \(3 attributes, no nulls\)/; } elsif ($offnum == 8) { @@ -427,7 +557,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $tup->{t_hoff} -= 3; push @expected, - qr/${$header}tuple data should begin at byte 24, but actually begins at byte 21 \(3 attributes, no nulls\)/; + qr/${$header}tuple data should begin at byte 24, but actually begins at byte 21 \(3 attributes, no nulls\)/; } elsif ($offnum == 9) { @@ -435,7 +565,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $tup->{t_infomask2} |= HEAP_NATTS_MASK; push @expected, 
- qr/${$header}number of attributes 2047 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 2047 exceeds maximum expected for table 3/; } elsif ($offnum == 10) { @@ -447,7 +577,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $tup->{t_bits} = 0xAA; push @expected, - qr/${$header}tuple data should begin at byte 280, but actually begins at byte 24 \(2047 attributes, has nulls\)/; + qr/${$header}tuple data should begin at byte 280, but actually begins at byte 24 \(2047 attributes, has nulls\)/; } elsif ($offnum == 11) { @@ -458,7 +588,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $tup->{t_hoff} = 32; push @expected, - qr/${$header}number of attributes 67 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 67 exceeds maximum expected for table 3/; } elsif ($offnum == 12) { @@ -482,7 +612,7 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) $header = header(0, $offnum, 1); push @expected, - qr/${header}attribute with length \d+ ends at offset \d+ beyond total tuple length \d+/; + qr/${header}attribute with length \d+ ends at offset \d+ beyond total tuple length \d+/; } elsif ($offnum == 13) { @@ -497,25 +627,27 @@ for (my $tupidx = 0; $tupidx < ROWCOUNT; $tupidx++) # Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI $tup->{t_infomask} |= HEAP_XMAX_COMMITTED; $tup->{t_infomask} |= HEAP_XMAX_IS_MULTI; - $tup->{t_xmax} = 4; + my $xmax = $datminmxid + 1000000; + $tup->{t_xmax} = $xmax; push @expected, - qr/${header}multitransaction ID 4 equals or exceeds next valid multitransaction ID 1/; + qr/${header}multitransaction ID $xmax equals or exceeds next valid multitransaction ID \d+/; } elsif ($offnum == 15) # Last offnum must equal ROWCOUNT { # Set both HEAP_XMAX_COMMITTED and HEAP_XMAX_IS_MULTI $tup->{t_infomask} |= HEAP_XMAX_COMMITTED; $tup->{t_infomask} |= HEAP_XMAX_IS_MULTI; - $tup->{t_xmax} = 4000000000; + my $xmax = $datminmxid - 10; + $tup->{t_xmax} = $xmax; push @expected, - 
qr/${header}multitransaction ID 4000000000 precedes relation minimum multitransaction ID threshold 1/; + qr/${header}multitransaction ID $xmax precedes relation minimum multitransaction ID threshold \d+/; } write_tuple($file, $offset, $tup); } close($file) - or BAIL_OUT("close failed: $!"); + or BAIL_OUT("close failed: $!"); $node->start; # Run pg_amcheck against the corrupt table with epoch=0, comparing actual diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index a8a46d5bf0..ffc89b3184 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -248,7 +248,7 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's full_page_writes: %s\n"), ControlFile->checkPointCopy.fullPageWrites ? _("on") : _("off")); printf(_("Latest checkpoint's NextXID: %llu\n"), - (unsigned long long) U64FromFullTransactionId(ControlFile->checkPointCopy.nextXid)); + (unsigned long long) XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid)); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %llu\n"), diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index dcb3b11d57..bab5773b2e 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -41,6 +41,7 @@ #include "access/attnum.h" #include "access/sysattr.h" #include "access/transam.h" +#include "c.h" #include "catalog/pg_aggregate_d.h" #include "catalog/pg_am_d.h" #include "catalog/pg_attribute_d.h" @@ -2830,7 +2831,7 @@ dumpDatabase(Archive *fout) *datistemplate, *datconnlimit, *tablespace; - uint32 frozenxid, + uint64 frozenxid, minmxid; char *qdatname; @@ -2891,8 +2892,8 @@ dumpDatabase(Archive *fout) iculocale = PQgetvalue(res, 0, i_daticulocale); else iculocale = NULL; - frozenxid = atooid(PQgetvalue(res, 0, i_frozenxid)); - minmxid = atooid(PQgetvalue(res, 0, i_minmxid)); + frozenxid = strtou64(PQgetvalue(res, 0, 
i_frozenxid), NULL, 0); + minmxid = strtou64(PQgetvalue(res, 0, i_minmxid), NULL, 0); dbdacl.acl = PQgetvalue(res, 0, i_datacl); dbdacl.acldefault = PQgetvalue(res, 0, i_acldefault); datistemplate = PQgetvalue(res, 0, i_datistemplate); @@ -3178,10 +3179,16 @@ dumpDatabase(Archive *fout) RelFileNumber relfilenumber; appendPQExpBuffer(loHorizonQry, "UPDATE pg_catalog.pg_class\n" - "SET relfrozenxid = '%u', relminmxid = '%u'\n" + "SET relfrozenxid = '%llu', relminmxid = '%llu'\n" "WHERE oid = %u;\n", - atooid(PQgetvalue(lo_res, i, ii_relfrozenxid)), - atooid(PQgetvalue(lo_res, i, ii_relminmxid)), + (unsigned long long) strtou64(PQgetvalue(lo_res, + i, + ii_relfrozenxid), + NULL, 0), + (unsigned long long) strtou64(PQgetvalue(lo_res, + i, + ii_relminmxid), + NULL, 0), atooid(PQgetvalue(lo_res, i, ii_oid))); oid = atooid(PQgetvalue(lo_res, i, ii_oid)); @@ -6461,11 +6468,11 @@ getTables(Archive *fout, int *numTables) tblinfo[i].relreplident = *(PQgetvalue(res, i, i_relreplident)); tblinfo[i].rowsec = (strcmp(PQgetvalue(res, i, i_relrowsec), "t") == 0); tblinfo[i].forcerowsec = (strcmp(PQgetvalue(res, i, i_relforcerowsec), "t") == 0); - tblinfo[i].frozenxid = atooid(PQgetvalue(res, i, i_relfrozenxid)); - tblinfo[i].toast_frozenxid = atooid(PQgetvalue(res, i, i_toastfrozenxid)); + tblinfo[i].frozenxid = strtou64(PQgetvalue(res, i, i_relfrozenxid), NULL, 0); + tblinfo[i].toast_frozenxid = strtou64(PQgetvalue(res, i, i_toastfrozenxid), NULL, 0); tblinfo[i].toast_oid = atooid(PQgetvalue(res, i, i_toastoid)); - tblinfo[i].minmxid = atooid(PQgetvalue(res, i, i_relminmxid)); - tblinfo[i].toast_minmxid = atooid(PQgetvalue(res, i, i_toastminmxid)); + tblinfo[i].minmxid = strtou64(PQgetvalue(res, i, i_relminmxid), NULL, 0); + tblinfo[i].toast_minmxid = strtou64(PQgetvalue(res, i, i_toastminmxid), NULL, 0); tblinfo[i].reloptions = pg_strdup(PQgetvalue(res, i, i_reloptions)); if (PQgetisnull(res, i, i_checkoption)) tblinfo[i].checkoption = NULL; diff --git a/src/bin/pg_dump/pg_dump.h 
b/src/bin/pg_dump/pg_dump.h index 427f5d45f6..f47a403b15 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -298,11 +298,11 @@ typedef struct _tableInfo bool rowsec; /* is row security enabled? */ bool forcerowsec; /* is row security forced? */ bool hasoids; /* does it have OIDs? */ - uint32 frozenxid; /* table's relfrozenxid */ - uint32 minmxid; /* table's relminmxid */ + uint64 frozenxid; /* table's relfrozenxid */ + uint64 minmxid; /* table's relminmxid */ Oid toast_oid; /* toast table's OID, or 0 if none */ - uint32 toast_frozenxid; /* toast table's relfrozenxid, if any */ - uint32 toast_minmxid; /* toast table's relminmxid */ + uint64 toast_frozenxid; /* toast table's relfrozenxid, if any */ + uint64 toast_minmxid; /* toast table's relminmxid */ int ncheck; /* # of CHECK expressions */ Oid reltype; /* OID of table's composite type, if any */ Oid reloftype; /* underlying type for typed table */ diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 977491b875..24967f4e71 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -63,7 +63,6 @@ static ControlFileData ControlFile; /* pg_control values */ static XLogSegNo newXlogSegNo; /* new XLOG segment # */ static bool guessed = false; /* T if we had to guess at any values */ static const char *progname; -static uint32 set_xid_epoch = (uint32) -1; static TransactionId set_oldest_xid = 0; static TransactionId set_xid = 0; static TransactionId set_oldest_commit_ts_xid = 0; @@ -95,7 +94,6 @@ main(int argc, char *argv[]) static struct option long_options[] = { {"commit-timestamp-ids", required_argument, NULL, 'c'}, {"pgdata", required_argument, NULL, 'D'}, - {"epoch", required_argument, NULL, 'e'}, {"force", no_argument, NULL, 'f'}, {"next-wal-file", required_argument, NULL, 'l'}, {"multixact-ids", required_argument, NULL, 'm'}, @@ -137,7 +135,7 @@ main(int argc, char *argv[]) } - while ((c = getopt_long(argc, argv, 
"c:D:e:fl:m:no:O:u:x:", long_options, NULL)) != -1) + while ((c = getopt_long(argc, argv, "c:D:fl:m:no:O:u:x:", long_options, NULL)) != -1) { switch (c) { @@ -153,24 +151,9 @@ main(int argc, char *argv[]) noupdate = true; break; - case 'e': - errno = 0; - set_xid_epoch = strtoul(optarg, &endptr, 0); - if (endptr == optarg || *endptr != '\0' || errno != 0) - { - /*------ - translator: the second %s is a command line argument (-e, etc) */ - pg_log_error("invalid argument for option %s", "-e"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit(1); - } - if (set_xid_epoch == -1) - pg_fatal("transaction ID epoch (-e) must not be -1"); - break; - case 'u': errno = 0; - set_oldest_xid = strtoul(optarg, &endptr, 0); + set_oldest_xid = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-u"); @@ -184,7 +167,7 @@ main(int argc, char *argv[]) case 'x': errno = 0; - set_xid = strtoul(optarg, &endptr, 0); + set_xid = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-x"); @@ -198,14 +181,14 @@ main(int argc, char *argv[]) case 'c': errno = 0; - set_oldest_commit_ts_xid = strtoul(optarg, &endptr, 0); + set_oldest_commit_ts_xid = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != ',' || errno != 0) { pg_log_error("invalid argument for option %s", "-c"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } - set_newest_commit_ts_xid = strtoul(endptr + 1, &endptr2, 0); + set_newest_commit_ts_xid = strtou64(endptr + 1, &endptr2, 0); if (endptr2 == endptr + 1 || *endptr2 != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-c"); @@ -237,7 +220,7 @@ main(int argc, char *argv[]) case 'm': errno = 0; - set_mxid = strtoul(optarg, &endptr, 0); + set_mxid = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != ',' || 
errno != 0) { pg_log_error("invalid argument for option %s", "-m"); @@ -245,7 +228,7 @@ main(int argc, char *argv[]) exit(1); } - set_oldestmxid = strtoul(endptr + 1, &endptr2, 0); + set_oldestmxid = strtou64(endptr + 1, &endptr2, 0); if (endptr2 == endptr + 1 || *endptr2 != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-m"); @@ -265,7 +248,7 @@ main(int argc, char *argv[]) case 'O': errno = 0; - set_mxoff = strtoul(optarg, &endptr, 0); + set_mxoff = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-O"); @@ -408,11 +391,6 @@ main(int argc, char *argv[]) * Adjust fields if required by switches. (Do this now so that printout, * if any, includes these values.) */ - if (set_xid_epoch != -1) - ControlFile.checkPointCopy.nextXid = - FullTransactionIdFromEpochAndXid(set_xid_epoch, - XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); - if (set_oldest_xid != 0) { ControlFile.checkPointCopy.oldestXid = set_oldest_xid; @@ -420,9 +398,7 @@ main(int argc, char *argv[]) } if (set_xid != 0) - ControlFile.checkPointCopy.nextXid = - FullTransactionIdFromEpochAndXid(EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid), - set_xid); + ControlFile.checkPointCopy.nextXid = FullTransactionIdFromXid(set_xid); if (set_oldest_commit_ts_xid != 0) ControlFile.checkPointCopy.oldestCommitTsXid = set_oldest_commit_ts_xid; @@ -655,7 +631,7 @@ GuessControlValues(void) ControlFile.checkPointCopy.PrevTimeLineID = 1; ControlFile.checkPointCopy.fullPageWrites = false; ControlFile.checkPointCopy.nextXid = - FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); + FullTransactionIdFromXid(FirstNormalTransactionId); ControlFile.checkPointCopy.nextOid = FirstGenbkiObjectId; ControlFile.checkPointCopy.nextMulti = FirstMultiXactId; ControlFile.checkPointCopy.nextMultiOffset = 0; @@ -706,6 +682,8 @@ GuessControlValues(void) * * NB: this display should be just those 
fields that will not be * reset by RewriteControlFile(). + * + * Special macros help to make translatable strings. */ static void PrintControlValues(bool guessed) @@ -725,8 +703,7 @@ PrintControlValues(bool guessed) ControlFile.checkPointCopy.ThisTimeLineID); printf(_("Latest checkpoint's full_page_writes: %s\n"), ControlFile.checkPointCopy.fullPageWrites ? _("on") : _("off")); - printf(_("Latest checkpoint's NextXID: %u:%llu\n"), - EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid), + printf(_("Latest checkpoint's NextXID: %llu\n"), (unsigned long long) XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile.checkPointCopy.nextOid); @@ -824,12 +801,6 @@ PrintNewControlValues(void) ControlFile.checkPointCopy.oldestXidDB); } - if (set_xid_epoch != -1) - { - printf(_("NextXID epoch: %u\n"), - EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); - } - if (set_oldest_commit_ts_xid != 0) { printf(_("oldestCommitTsXid: %llu\n"), @@ -1133,7 +1104,6 @@ usage(void) " set oldest and newest transactions bearing\n" " commit timestamp (zero means no change)\n")); printf(_(" [-D, --pgdata=]DATADIR data directory\n")); - printf(_(" -e, --epoch=XIDEPOCH set next transaction ID epoch\n")); printf(_(" -f, --force force update to be done\n")); printf(_(" -l, --next-wal-file=WALFILE set minimum starting location for new WAL\n")); printf(_(" -m, --multixact-ids=MXID,MXID set next and oldest multitransaction ID\n")); diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index 7f8042f34a..1d47abd975 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -20,6 +20,7 @@ OBJS = \ parallel.o \ pg_upgrade.o \ relfilenumber.o \ + segresize.o \ server.o \ tablespace.o \ util.o \ diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index f1bc1e6886..3ff1951185 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -33,6 +33,8 @@ 
static void check_for_pg_role_prefix(ClusterInfo *cluster); static void check_for_new_tablespace_dir(ClusterInfo *new_cluster); static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster); static char *get_canonical_locale_name(int category, const char *locale); +static void check_for_32bit_xid_usage(ClusterInfo *cluster); +static bool is_xid_wraparound(ClusterInfo *cluster); /* @@ -82,7 +84,7 @@ output_check_banner(bool live_check) void -check_and_dump_old_cluster(bool live_check) +check_and_dump_old_cluster(bool live_check, bool *is_wraparound) { /* -- OLD -- */ @@ -168,6 +170,17 @@ check_and_dump_old_cluster(bool live_check) if (GET_MAJOR_VERSION(old_cluster.major_version) <= 903) old_9_3_check_for_line_data_type_usage(&old_cluster); + /* Prepare for 64bit xid */ + if (old_cluster.controldata.cat_ver < XID_FORMATCHANGE_CAT_VER) + { + /* Check if 32-bit xid type is used in tables */ + check_for_32bit_xid_usage(&old_cluster); + /* Check indexes to be upgraded */ + invalidate_spgist_indexes(&old_cluster, true); + invalidate_gin_indexes(&old_cluster, true); + invalidate_external_indexes(&old_cluster, true); + } + /* * While not a check option, we do this now because this is the only time * the old server is running. 
@@ -175,6 +188,8 @@ check_and_dump_old_cluster(bool live_check) if (!user_opts.check) generate_old_dump(); + *is_wraparound = is_xid_wraparound(&old_cluster); + if (!live_check) stop_postmaster(false); } @@ -244,6 +259,17 @@ issue_warnings_and_set_wal_level(void) if (GET_MAJOR_VERSION(old_cluster.major_version) <= 906) old_9_6_invalidate_hash_indexes(&new_cluster, false); + /* Reindex for 64-bit xid */ + if (old_cluster.controldata.cat_ver < XID_FORMATCHANGE_CAT_VER) + { + /* Check if 32-bit xid type is used in tables */ + check_for_32bit_xid_usage(&old_cluster); + /* Check indexes to be upgraded */ + invalidate_spgist_indexes(&old_cluster, true); + invalidate_gin_indexes(&old_cluster, true); + invalidate_external_indexes(&old_cluster, true); + } + report_extension_updates(&new_cluster); stop_postmaster(false); @@ -1502,3 +1528,124 @@ get_canonical_locale_name(int category, const char *locale) return res; } + +/* + * check_for_32bit_xid_usage() + * + * Postgres Pro Enterprise changes the xid storage format to 64-bit. Check if + * the xid type is used in tables. + */ +static void +check_for_32bit_xid_usage(ClusterInfo *cluster) +{ + int dbnum; + FILE *script = NULL; + bool found = false; + char output_path[MAXPGPATH]; + + prep_status("Checking for incompatible \"xid\" data type"); + + snprintf(output_path, sizeof(output_path), "tables_using_xid.txt"); + + for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) + { + PGresult *res; + bool db_used = false; + int ntups; + int rowno; + int i_nspname, + i_relname, + i_attname; + DbInfo *active_db = &cluster->dbarr.dbs[dbnum]; + PGconn *conn = connectToServer(cluster, active_db->db_name); + + /* + * While several relkinds don't store any data, e.g. views, they can + * be used to define data types of other columns, so we check all + * relkinds. 
+ */ + res = executeQueryOrDie(conn, + "SELECT n.nspname, c.relname, a.attname " + "FROM pg_catalog.pg_class c, " + " pg_catalog.pg_namespace n, " + " pg_catalog.pg_attribute a " + "WHERE c.oid = a.attrelid AND " + " a.attnum >= 1 AND " + " a.atttypid = 'pg_catalog.xid'::pg_catalog.regtype AND " + " c.relnamespace = n.oid AND " + /* exclude possible orphaned temp tables */ + " n.nspname !~ '^pg_temp_' AND " + " n.nspname NOT IN ('pg_catalog', 'information_schema')"); + + ntups = PQntuples(res); + i_nspname = PQfnumber(res, "nspname"); + i_relname = PQfnumber(res, "relname"); + i_attname = PQfnumber(res, "attname"); + for (rowno = 0; rowno < ntups; rowno++) + { + found = true; + if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL) + pg_fatal("could not open file \"%s\": %s\n", + output_path, strerror(errno)); + if (!db_used) + { + fprintf(script, "Database: %s\n", active_db->db_name); + db_used = true; + } + fprintf(script, " %s.%s.%s\n", + PQgetvalue(res, rowno, i_nspname), + PQgetvalue(res, rowno, i_relname), + PQgetvalue(res, rowno, i_attname)); + } + + PQclear(res); + + PQfinish(conn); + } + + if (script) + fclose(script); + + if (found) + { + pg_log(PG_REPORT, "fatal\n"); + pg_fatal("Your installation contains the \"xid\" data type in user tables.\n" + "The internal format of \"xid\" changed in Postgres Pro Enterprise so this cluster\n" + "cannot currently be upgraded. Note that even dropped attributes cause a problem.\n" + "You can remove the problem tables and restart the upgrade.\n" + "A list of the problem columns is in the file:\n" + " %s\n\n", output_path); + } + else + check_ok(); +} + +/* + * is_xid_wraparound() + * + * Return true if 32-xid cluster had wraparound. 
+ */ +static bool +is_xid_wraparound(ClusterInfo *cluster) +{ + PGconn *conn; + PGresult *res; + bool is_wraparound; + + conn = connectToServer(cluster, "template1"); + + /* + * txid_current is extended with an "epoch" counter, so to check for + * wraparound in an old 32-bit xid cluster we cut off the epoch part. + */ + res = executeQueryOrDie(conn, + "SELECT 1 " + "FROM pg_catalog.pg_database, txid_current() tx " + "WHERE (tx %% 4294967295)::bigint <= datfrozenxid::text::bigint " + "LIMIT 1"); + is_wraparound = PQntuples(res) ? true : false; + PQclear(res); + PQfinish(conn); + + return is_wraparound; +} diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index a5b4a77570..5021f0112b 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -284,6 +284,8 @@ get_control_data(ClusterInfo *cluster, bool live_check) xid.value = strtou64(p, NULL, 10); /* + * Try to read 32-bit XID format 'epoch:xid'. + * + * Delimiter changed from '/' to ':' in 9.6. 
We don't test for * the catalog version of the change because the catalog version * is pulled from pg_controldata too, and it isn't worth adding an @@ -299,8 +301,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) if (p == NULL) { /* FullTransactionId representation */ - cluster->controldata.chkpnt_nxtxid = XidFromFullTransactionId(xid); - cluster->controldata.chkpnt_nxtepoch = EpochFromFullTransactionId(xid); + cluster->controldata.chkpnt_nxtxid = xid.value; } else { @@ -309,8 +310,8 @@ get_control_data(ClusterInfo *cluster, bool live_check) /* Epoch:Xid representation */ p++; /* remove '/' or ':' char */ - cluster->controldata.chkpnt_nxtxid = str2uint(p); - cluster->controldata.chkpnt_nxtepoch = (TransactionId) XidFromFullTransactionId(xid); + cluster->controldata.chkpnt_nxtxid = (XidFromFullTransactionId(xid)) << 32 | + (TransactionId) str2uint(p); } got_xid = true; @@ -334,7 +335,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) pg_fatal("%d: controldata retrieval problem", __LINE__); p++; /* remove ':' char */ - cluster->controldata.chkpnt_nxtmulti = str2uint(p); + cluster->controldata.chkpnt_nxtmulti = strtou64(p, NULL, 10); got_multi = true; } else if ((p = strstr(bufin, "Latest checkpoint's oldestXID:")) != NULL) @@ -345,7 +346,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) pg_fatal("%d: controldata retrieval problem", __LINE__); p++; /* remove ':' char */ - cluster->controldata.chkpnt_oldstxid = str2uint(p); + cluster->controldata.chkpnt_oldstxid = strtou64(p, NULL, 10); got_oldestxid = true; } else if ((p = strstr(bufin, "Latest checkpoint's oldestMultiXid:")) != NULL) @@ -356,7 +357,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) pg_fatal("%d: controldata retrieval problem", __LINE__); p++; /* remove ':' char */ - cluster->controldata.chkpnt_oldstMulti = str2uint(p); + cluster->controldata.chkpnt_oldstMulti = strtou64(p, NULL, 10); got_oldestmulti = true; } else if ((p = strstr(bufin, "Latest checkpoint's 
NextMultiOffset:")) != NULL) @@ -367,7 +368,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) pg_fatal("%d: controldata retrieval problem", __LINE__); p++; /* remove ':' char */ - cluster->controldata.chkpnt_nxtmxoff = str2uint(p); + cluster->controldata.chkpnt_nxtmxoff = strtou64(p, NULL, 10); got_mxoff = true; } else if ((p = strstr(bufin, "First log segment after reset:")) != NULL) diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index 079fbda838..dedd8ad2b7 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -174,7 +174,8 @@ linkFile(const char *src, const char *dst, */ void rewriteVisibilityMap(const char *fromfile, const char *tofile, - const char *schemaName, const char *relName) + const char *schemaName, const char *relName, + bool update_version) { int src_fd; int dst_fd; @@ -290,6 +291,11 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, if (old_lastpart && empty) break; + if (update_version) + PageSetPageSizeAndVersion((Page) new_vmbuf.data, + PageGetPageSize((Page) new_vmbuf.data), + PG_PAGE_LAYOUT_VERSION); + /* Set new checksum for visibility map page, if enabled */ if (new_cluster.controldata.data_checksum_version != 0) ((PageHeader) new_vmbuf.data)->pd_checksum = @@ -316,6 +322,97 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, close(src_fd); } +/* + * updateSegmentPagesVersion() + * + * Transform a segment file, copying from src to dst. + * schemaName/relName are the relation's SQL name (used for error messages only). + * + * Read segment pages one by one and set their version to PG_PAGE_LAYOUT_VERSION. + * + * Although the FSM and VM formats do not change when switching to 64-bit XIDs, we + * must upgrade the page version in order to avoid lazy conversion on first read. 
+ */ +void +updateSegmentPagesVersion(const char *fromfile, const char *tofile, + const char *schemaName, const char *relName) +{ + int src_fd; + int dst_fd; + struct stat statbuf; + ssize_t src_filesize; + ssize_t totalBytesRead; + ssize_t bytesRead; + BlockNumber blkno; + PGAlignedBlock buf; + + if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", + schemaName, relName, fromfile, strerror(errno)); + + if (fstat(src_fd, &statbuf) != 0) + pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s", + schemaName, relName, fromfile, strerror(errno)); + + if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", + schemaName, relName, tofile, strerror(errno)); + + /* Save old file size */ + src_filesize = statbuf.st_size; + totalBytesRead = 0; + blkno = 0; + + while (totalBytesRead < src_filesize) + { + errno = 0; + if ((bytesRead = read(src_fd, buf.data, BLCKSZ)) != BLCKSZ) + { + if (bytesRead < 0) + pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", + schemaName, relName, fromfile, strerror(errno)); + else + pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"", + schemaName, relName, fromfile); + } + + totalBytesRead += BLCKSZ; + PageSetPageSizeAndVersion((Page) buf.data, + PageGetPageSize((Page) buf.data), + PG_PAGE_LAYOUT_VERSION); + + /* Set new checksum for page, if enabled */ + if (new_cluster.controldata.data_checksum_version != 0) + ((PageHeader) buf.data)->pd_checksum = + pg_checksum_page(buf.data, blkno); + + /* + * We are dealing here only with FSM and VM pages. 
+ */ + if (((PageHeader) buf.data)->pd_lower != SizeOfPageHeaderData || + ((PageHeader) buf.data)->pd_upper != BLCKSZ) + pg_fatal("error while copying relation \"%s.%s\": unknown page format found in file \"%s\"", + schemaName, relName, fromfile); + + errno = 0; + if (write(dst_fd, buf.data, BLCKSZ) != BLCKSZ) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s", + schemaName, relName, tofile, strerror(errno)); + } + + blkno++; + } + + /* Clean up */ + close(dst_fd); + close(src_fd); +} + void check_file_clone(void) { diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index 02f030e0cc..8822377e07 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -10,6 +10,7 @@ pg_upgrade_sources = files( 'parallel.c', 'pg_upgrade.c', 'relfilenumber.c', + 'segresize.c', 'server.c', 'tablespace.c', 'util.c', diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index a8b1e0ed84..326dc9c8d5 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -44,6 +44,9 @@ #include #endif +#include "access/multixact.h" +#include "access/transam.h" +#include "access/xlog_internal.h" #include "catalog/pg_class_d.h" #include "common/file_perm.h" #include "common/logging.h" @@ -53,7 +56,7 @@ static void prepare_new_cluster(void); static void prepare_new_globals(void); -static void create_new_objects(void); +static void create_new_objects(bool is_wraparound); static void copy_xact_xlog_xid(void); static void set_frozenxids(bool minmxid_only); static void make_outputdirs(char *pgdata); @@ -80,6 +83,7 @@ main(int argc, char **argv) { char *deletion_script_file_name = NULL; bool live_check = false; + bool is_wraparound = false; /* * pg_upgrade doesn't currently use common/logging.c, but initialize it @@ -125,7 +129,7 @@ main(int argc, char **argv) 
check_cluster_compatibility(live_check); - check_and_dump_old_cluster(live_check); + check_and_dump_old_cluster(live_check, &is_wraparound); /* -- NEW -- */ @@ -156,7 +160,7 @@ main(int argc, char **argv) prepare_new_globals(); - create_new_objects(); + create_new_objects(is_wraparound); stop_postmaster(false); @@ -365,7 +369,6 @@ setup(char *argv0, bool *live_check) } } - static void prepare_new_cluster(void) { @@ -419,7 +422,7 @@ prepare_new_globals(void) static void -create_new_objects(void) +create_new_objects(bool is_wraparound) { int dbnum; @@ -511,11 +514,23 @@ create_new_objects(void) check_ok(); /* - * We don't have minmxids for databases or relations in pre-9.3 clusters, - * so set those after we have restored the schema. + * Refix datfrozenxid and datminmxid */ if (GET_MAJOR_VERSION(old_cluster.major_version) <= 902) set_frozenxids(true); + else if (old_cluster.controldata.cat_ver < XID_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= XID_FORMATCHANGE_CAT_VER) + { + /* + * During upgrade from 32-bit to 64-bit xids save relfrozenxids if + * there was no wraparound in old cluster. Otherwise, reset them to + * FirstNormalTransactionId value. + */ + if (is_wraparound) + set_frozenxids(false); + else + set_frozenxids(true); + } /* update new_cluster info now that we have objects in the databases */ get_db_and_rel_infos(&new_cluster); @@ -569,14 +584,37 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir) static void copy_xact_xlog_xid(void) { - /* - * Copy old commit logs to new data dir. pg_clog has been renamed to - * pg_xact in post-10 clusters. - */ - copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? - "pg_clog" : "pg_xact", - GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? - "pg_clog" : "pg_xact"); + TransactionId next_xid; + +#define GetClogDirName(cluster) \ + GET_MAJOR_VERSION(cluster.major_version) <= 906 ? 
"pg_clog" : "pg_xact" + + /* Set next xid to 2^32 if we're upgrading from 32 bit postgres */ + if (old_cluster.controldata.cat_ver < XID_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= XID_FORMATCHANGE_CAT_VER) + next_xid = ((TransactionId) 1 << 32); + else + next_xid = old_cluster.controldata.chkpnt_nxtxid; + + if (old_cluster.controldata.cat_ver < XID_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= XID_FORMATCHANGE_CAT_VER) + { + /* Convert commit logs and copy to the new data dir */ + prep_status("Transforming commit log segments"); + convert_xact(psprintf("%s/%s", old_cluster.pgdata, GetClogDirName(old_cluster)), + psprintf("%s/%s", new_cluster.pgdata, GetClogDirName(new_cluster))); + check_ok(); + } + else + { + /* + * Copy old commit logs to new data dir. pg_clog has been renamed to + * pg_xact in post-10 clusters. + */ + prep_status("Copying commit log segments"); + copy_subdir_files(GetClogDirName(old_cluster), GetClogDirName(new_cluster)); + check_ok(); + } prep_status("Setting oldest XID for new cluster"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, @@ -590,19 +628,20 @@ copy_xact_xlog_xid(void) prep_status("Setting next transaction ID and epoch for new cluster"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -f -x %llu \"%s\"", - new_cluster.bindir, - (unsigned long long) old_cluster.controldata.chkpnt_nxtxid, + new_cluster.bindir, (unsigned long long) next_xid, new_cluster.pgdata); +#ifdef NOT_USED exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -f -e %u \"%s\"", new_cluster.bindir, old_cluster.controldata.chkpnt_nxtepoch, new_cluster.pgdata); +#endif /* must reset commit timestamp limits also */ exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -f -c %llu,%llu \"%s\"", new_cluster.bindir, - (unsigned long long) old_cluster.controldata.chkpnt_nxtxid, - (unsigned long long) old_cluster.controldata.chkpnt_nxtxid, + (unsigned long long) next_xid, + (unsigned long long) 
next_xid, new_cluster.pgdata); check_ok(); @@ -615,8 +654,48 @@ copy_xact_xlog_xid(void) if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) { - copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); - copy_subdir_files("pg_multixact/members", "pg_multixact/members"); + uint64 oldest_mxid = old_cluster.controldata.chkpnt_oldstMulti; + uint64 next_mxid = old_cluster.controldata.chkpnt_nxtmulti; + uint64 next_mxoff = old_cluster.controldata.chkpnt_nxtmxoff; + + if (old_cluster.controldata.cat_ver >= XID_FORMATCHANGE_CAT_VER) + { + copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); + copy_subdir_files("pg_multixact/members", "pg_multixact/members"); + } + else + { + MultiXactOffset oldest_mxoff; + + remove_new_subdir("pg_multixact/offsets", false); + oldest_mxoff = convert_multixact_offsets("pg_multixact/offsets", "pg_multixact/offsets"); + + remove_new_subdir("pg_multixact/members", false); + convert_multixact_members("pg_multixact/members", "pg_multixact/members", oldest_mxoff); + + /* + * Handle wraparound if we're upgrading from 32 bit postgres. + * Invalid 0 mxids/offsets are skipped, so 1 becomes 2^32. 
+ */ + if (oldest_mxoff) + { + if (next_mxid < oldest_mxid) + next_mxid += ((MultiXactId) 1 << 32) - FirstMultiXactId; + + if (next_mxoff < oldest_mxoff) + next_mxoff += ((MultiXactOffset) 1 << 32) - 1; + + /* Offsets and members were rewritten, oldest_mxoff = 1 */ + next_mxoff -= oldest_mxoff - 1; + oldest_mxoff = 1; + + /* + * Save converted next_mxid for possible usage in + * set_frozenxids() + */ + old_cluster.controldata.chkpnt_nxtmulti = next_mxid; + } + } prep_status("Setting next multixact ID and offset for new cluster"); @@ -627,9 +706,9 @@ copy_xact_xlog_xid(void) exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -O %llu -m %llu,%llu \"%s\"", new_cluster.bindir, - (unsigned long long) old_cluster.controldata.chkpnt_nxtmxoff, - (unsigned long long) old_cluster.controldata.chkpnt_nxtmulti, - (unsigned long long) old_cluster.controldata.chkpnt_oldstMulti, + (unsigned long long) next_mxoff, + (unsigned long long) next_mxid, + (unsigned long long) oldest_mxid, new_cluster.pgdata); check_ok(); } @@ -703,6 +782,8 @@ set_frozenxids(bool minmxid_only) int ntups; int i_datname; int i_datallowconn; + TransactionId frozen_xid; + MultiXactId minmxid; if (!minmxid_only) prep_status("Setting frozenxid and minmxid counters in new cluster"); @@ -711,18 +792,26 @@ set_frozenxids(bool minmxid_only) conn_template1 = connectToServer(&new_cluster, "template1"); + if (old_cluster.controldata.cat_ver < XID_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= XID_FORMATCHANGE_CAT_VER) + frozen_xid = FirstNormalTransactionId; + else + frozen_xid = old_cluster.controldata.chkpnt_nxtxid; + + minmxid = old_cluster.controldata.chkpnt_nxtmulti; + if (!minmxid_only) /* set pg_database.datfrozenxid */ PQclear(executeQueryOrDie(conn_template1, "UPDATE pg_catalog.pg_database " "SET datfrozenxid = '%llu'", - (unsigned long long) old_cluster.controldata.chkpnt_nxtxid)); + (unsigned long long) frozen_xid)); /* set pg_database.datminmxid */ 
PQclear(executeQueryOrDie(conn_template1, "UPDATE pg_catalog.pg_database " "SET datminmxid = '%llu'", - (unsigned long long) old_cluster.controldata.chkpnt_nxtmulti)); + (unsigned long long) minmxid)); /* get database names */ dbres = executeQueryOrDie(conn_template1, @@ -762,7 +851,7 @@ set_frozenxids(bool minmxid_only) CppAsString2(RELKIND_RELATION) ", " CppAsString2(RELKIND_MATVIEW) ", " CppAsString2(RELKIND_TOASTVALUE) ")", - (unsigned long long) old_cluster.controldata.chkpnt_nxtxid)); + (unsigned long long) frozen_xid)); /* set pg_class.relminmxid */ PQclear(executeQueryOrDie(conn, @@ -773,7 +862,7 @@ set_frozenxids(bool minmxid_only) CppAsString2(RELKIND_RELATION) ", " CppAsString2(RELKIND_MATVIEW) ", " CppAsString2(RELKIND_TOASTVALUE) ")", - (unsigned long long) old_cluster.controldata.chkpnt_nxtmulti)); + (unsigned long long) minmxid)); PQfinish(conn); /* Reset datallowconn flag */ diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 31589b0fdc..538994d5e5 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -115,6 +115,11 @@ extern char *output_files[]; */ #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 +/* + * xid format changed from 32-bit to 64-bit. 
+ */ +#define XID_FORMATCHANGE_CAT_VER 999999999 + /* * large object chunk size added to pg_controldata, * commit 5f93c37805e7485488480916b4585e098d3cc883 @@ -199,13 +204,13 @@ typedef struct uint32 ctrl_ver; uint32 cat_ver; char nextxlogfile[25]; - uint32 chkpnt_nxtxid; - uint32 chkpnt_nxtepoch; + uint64 chkpnt_nxtxid; + uint32 chkpnt_nxtepoch; /* for 32bit xids only */ uint32 chkpnt_nxtoid; - uint32 chkpnt_nxtmulti; - uint32 chkpnt_nxtmxoff; - uint32 chkpnt_oldstMulti; - uint32 chkpnt_oldstxid; + uint64 chkpnt_nxtmulti; + uint64 chkpnt_nxtmxoff; + uint64 chkpnt_oldstMulti; + uint64 chkpnt_oldstxid; uint32 align; uint32 blocksz; uint32 largesz; @@ -333,7 +338,7 @@ extern OSInfo os_info; /* check.c */ void output_check_banner(bool live_check); -void check_and_dump_old_cluster(bool live_check); +void check_and_dump_old_cluster(bool live_check, bool *is_wraparound); void check_new_cluster(void); void report_clusters_compatible(void); void issue_warnings_and_set_wal_level(void); @@ -374,7 +379,10 @@ void copyFile(const char *src, const char *dst, void linkFile(const char *src, const char *dst, const char *schemaName, const char *relName); void rewriteVisibilityMap(const char *fromfile, const char *tofile, - const char *schemaName, const char *relName); + const char *schemaName, const char *relName, + bool update_version); +void updateSegmentPagesVersion(const char *fromfile, const char *tofile, + const char *schemaName, const char *relName); void check_file_clone(void); void check_hard_link(void); @@ -456,6 +464,10 @@ void old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, void old_11_check_for_sql_identifier_data_type_usage(ClusterInfo *cluster); void report_extension_updates(ClusterInfo *cluster); +void invalidate_spgist_indexes(ClusterInfo *cluster, bool check_mode); +void invalidate_gin_indexes(ClusterInfo *cluster, bool check_mode); +void invalidate_external_indexes(ClusterInfo *cluster, bool check_mode); + /* parallel.c */ void parallel_exec_prog(const char 
*log_file, const char *opt_log_file, const char *fmt,...) pg_attribute_printf(3, 4); @@ -463,3 +475,9 @@ void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr char *old_pgdata, char *new_pgdata, char *old_tablespace); bool reap_child(bool wait_for_child); + +/* segresize.c */ +void convert_xact(const char *olddir, const char *newdir); +MultiXactOffset convert_multixact_offsets(const char *olddir, const char *newdir); +void convert_multixact_members(const char *olddir, const char *newdir, + MultiXactOffset oldest_mxoff); diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index c3f3d6bc0a..678e74051e 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -16,7 +16,8 @@ #include "pg_upgrade.h" static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); -static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); +static void transfer_relfile(FileNameMap *map, const char *type_suffix, + bool vm_must_add_frozenbit, bool update_version); /* @@ -136,6 +137,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) { int mapnum; bool vm_must_add_frozenbit = false; + bool update_version = false; /* * Do we need to rewrite visibilitymap? @@ -144,19 +146,28 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER) vm_must_add_frozenbit = true; + /* + * Need to update FSM and VM pages version to avoid lazy conversion. 
+ */ + if (old_cluster.controldata.cat_ver < new_cluster.controldata.cat_ver) + update_version = true; + for (mapnum = 0; mapnum < size; mapnum++) { if (old_tablespace == NULL || strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0) { /* transfer primary file */ - transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit); + transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit, + update_version); /* * Copy/link any fsm and vm files, if they exist */ - transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit); - transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit); + transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit, + update_version); + transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit, + update_version); } } } @@ -170,7 +181,8 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) * mode. */ static void -transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit) +transfer_relfile(FileNameMap *map, const char *type_suffix, + bool vm_must_add_frozenbit, bool update_version) { char old_file[MAXPGPATH]; char new_file[MAXPGPATH]; @@ -235,7 +247,17 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro /* Need to rewrite visibility map format */ pg_log(PG_VERBOSE, "rewriting \"%s\" to \"%s\"", old_file, new_file); - rewriteVisibilityMap(old_file, new_file, map->nspname, map->relname); + rewriteVisibilityMap(old_file, new_file, map->nspname, + map->relname, update_version); + } + else if ((update_version && strcmp(type_suffix, "_vm") == 0) || + (update_version && strcmp(type_suffix, "_fsm") == 0)) + { + /* Need to update pages version */ + pg_log(PG_VERBOSE, "rewriting \"%s\" to \"%s\"", + old_file, new_file); + updateSegmentPagesVersion(old_file, new_file, map->nspname, + map->relname); } else switch (user_opts.transfer_mode) diff --git a/src/bin/pg_upgrade/segresize.c b/src/bin/pg_upgrade/segresize.c new file mode 100644 index 
0000000000..99e2c5ecde --- /dev/null +++ b/src/bin/pg_upgrade/segresize.c @@ -0,0 +1,586 @@ +/*------------------------------------------------------------------------- + * + * segresize.c + * SLRU segment resize utility from 32bit to 64bit xid format + * + * Copyright (c) 2015-2022, Postgres Professional + * + * IDENTIFICATION + * src/bin/pg_upgrade/segresize.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include "pg_upgrade.h" +#include "access/multixact.h" +#include "access/transam.h" + +#define SLRU_PAGES_PER_SEGMENT_OLD 32 +#define SLRU_PAGES_PER_SEGMENT 32 /* Should be equal to value from slru.h */ + +#define CLOG_BITS_PER_XACT 2 +#define CLOG_XACTS_PER_BYTE 4 +#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) + +typedef uint32 MultiXactId32; +typedef uint32 MultiXactOffset32; +typedef uint32 TransactionId32; + +#define MaxTransactionId32 ((TransactionId32) 0xFFFFFFFF) +#define MaxMultiXactId32 ((MultiXactId32) 0xFFFFFFFF) +#define MaxMultiXactOffset32 ((MultiXactOffset32) 0xFFFFFFFF) + +#define MULTIXACT_OFFSETS_PER_PAGE_OLD (BLCKSZ / sizeof(MultiXactOffset32)) +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 + +/* 64xid */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 8 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) + +/* 32xid */ +#define MULTIXACT_FLAGBYTES_PER_GROUP_OLD 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP_OLD \ + (MULTIXACT_FLAGBYTES_PER_GROUP_OLD * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE_OLD \ + (sizeof(TransactionId32) * 
MULTIXACT_MEMBERS_PER_MEMBERGROUP_OLD + MULTIXACT_FLAGBYTES_PER_GROUP_OLD) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE_OLD (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE_OLD) +#define MULTIXACT_MEMBERS_PER_PAGE_OLD \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE_OLD * MULTIXACT_MEMBERS_PER_MEMBERGROUP_OLD) + +typedef struct SLRUSegmentState +{ + const char *dir; + FILE *file; + int64 segno; + int64 pageno; + bool is_empty_segment; +} SLRUSegmentState; + +static char * +slru_filename_old(const char *path, int64 segno) +{ + Assert(segno <= PG_INT32_MAX); + return psprintf("%s/%04X", path, (int) segno); +} + +static char * +slru_filename_new(const char *path, int64 segno) +{ + return psprintf("%s/%012llX", path, (long long) segno); +} + +static inline FILE * +open_file(SLRUSegmentState *state, + char * (filename_fn)(const char *path, int64 segno), + char *mode, char *fatal_msg) +{ + char *filename = filename_fn(state->dir, state->segno); + FILE *fd = fopen(filename, mode); + + if (!fd) + pg_fatal(fatal_msg, filename); + + pfree(filename); + + return fd; +} + +static void +close_file(SLRUSegmentState *state, + char * (filename_fn)(const char *path, int64 segno)) +{ + if (state->file != NULL) + { + if (fclose(state->file) != 0) + pg_fatal("could not close file \"%s\": %m", + filename_fn(state->dir, state->segno)); + state->file = NULL; + } +} + +static inline int +read_file(SLRUSegmentState *state, void *buf) +{ + size_t n = fread(buf, sizeof(char), BLCKSZ, state->file); + + if (n != 0) + return n; + + if (ferror(state->file)) + pg_fatal("could not read file \"%s\": %m", + slru_filename_old(state->dir, state->segno)); + + if (!feof(state->file)) + pg_fatal("unknown file read state \"%s\": %m", + slru_filename_old(state->dir, state->segno)); + + close_file(state, slru_filename_old); + + return 0; +} + +static int +read_old_segment_page(SLRUSegmentState *state, void *buf, bool *is_empty) +{ + int n; + + /* Open next segment file, if needed */ + if (!state->file) + { + state->file = 
open_file(state, slru_filename_old, "rb", + "could not open source file \"%s\": %m"); + + /* Set position to the needed page */ + if (fseek(state->file, state->pageno * BLCKSZ, SEEK_SET)) + close_file(state, slru_filename_old); + + /* + * Skip segment conversion if segment file doesn't exist. + * First segment file should exist in any case. + */ + if (state->segno != 0) + state->is_empty_segment = true; + } + + if (state->file) + { + /* Segment file does exist, read page from it */ + state->is_empty_segment = false; + + /* Try to read BLCKSZ bytes */ + n = read_file(state, buf); + *is_empty = (n == 0); + + /* Zeroing buf tail if needed */ + if (n) + memset((char *) buf + n, 0, BLCKSZ - n); + } + else + { + n = state->is_empty_segment ? + BLCKSZ : /* Skip empty block at the end of segment */ + 0; /* We reached the last segment */ + *is_empty = true; + + if (n) + memset((char *) buf, 0, BLCKSZ); + } + + state->pageno++; + + if (state->pageno >= SLRU_PAGES_PER_SEGMENT_OLD) + { + /* Start new segment */ + state->segno++; + state->pageno = 0; + close_file(state, slru_filename_old); + } + + return n; +} + +static void +write_new_segment_page(SLRUSegmentState *state, void *buf, bool is_empty) +{ + /* + * Create a new segment file if we still didn't. Creation is postponed + * until the first non-empty page is found. This helps not to create + * completely empty segments. 
+ */ + if (!state->file && !is_empty) + { + state->file = open_file(state, slru_filename_new, "wb", + "could not open target file \"%s\": %m"); + + /* Write zeroes to the previously skipped prefix */ + if (state->pageno > 0) + { + char zerobuf[BLCKSZ] = {0}; + + for (int64 i = 0; i < state->pageno; i++) + { + if (fwrite(zerobuf, sizeof(char), BLCKSZ, state->file) != BLCKSZ) + pg_fatal("could not write file \"%s\": %m", + slru_filename_new(state->dir, state->segno)); + } + } + + } + + /* Write page to the new segment (if it was created) */ + if (state->file) + { + if (fwrite(buf, sizeof(char), BLCKSZ, state->file) != BLCKSZ) + pg_fatal("could not write file \"%s\": %m", + slru_filename_new(state->dir, state->segno)); + } + + state->pageno++; + + /* + * Did we reach the maximum page number? Then close segment file and + * create a new one on the next iteration + */ + if (state->pageno >= SLRU_PAGES_PER_SEGMENT) + { + state->segno++; + state->pageno = 0; + close_file(state, slru_filename_new); + } +} + +/* + * Convert pg_xact segments. 
+ */ +void +convert_xact(const char *old_subdir, const char *new_subdir) +{ + SLRUSegmentState oldseg = {0}; + SLRUSegmentState newseg = {0}; + TransactionId oldest_xid = old_cluster.controldata.chkpnt_oldstxid; + TransactionId next_xid = old_cluster.controldata.chkpnt_nxtxid; + TransactionId xid; + int64 pageno; + char buf[BLCKSZ] = {0}; + + oldseg.dir = old_subdir; + newseg.dir = new_subdir; + + pageno = oldest_xid / CLOG_XACTS_PER_PAGE; + + oldseg.segno = pageno / SLRU_PAGES_PER_SEGMENT_OLD; + oldseg.pageno = pageno % SLRU_PAGES_PER_SEGMENT_OLD; + + newseg.segno = pageno / SLRU_PAGES_PER_SEGMENT; + newseg.pageno = pageno % SLRU_PAGES_PER_SEGMENT; + + if (next_xid < oldest_xid) + next_xid += (TransactionId) 1 << 32; /* wraparound */ + + /* Copy xid flags reading only needed segment pages */ + for (xid = oldest_xid & ~(CLOG_XACTS_PER_PAGE - 1); + xid <= ((next_xid - 1) & ~(CLOG_XACTS_PER_PAGE - 1)); + xid += CLOG_XACTS_PER_PAGE) + { + bool is_empty; + + /* Handle possible segment wraparound */ + if (oldseg.segno > MaxTransactionId32 / CLOG_XACTS_PER_PAGE / SLRU_PAGES_PER_SEGMENT_OLD) + { + pageno = (MaxTransactionId32 + 1) / CLOG_XACTS_PER_PAGE; + + Assert(oldseg.segno == pageno / SLRU_PAGES_PER_SEGMENT_OLD); + Assert(!oldseg.pageno); + Assert(!oldseg.file); + oldseg.segno = 0; + + Assert(newseg.segno == pageno / SLRU_PAGES_PER_SEGMENT); + Assert(!newseg.pageno); + Assert(!newseg.file); + newseg.segno = 0; + } + + read_old_segment_page(&oldseg, buf, &is_empty); + write_new_segment_page(&newseg, buf, is_empty); + } + + /* Release resources */ + close_file(&oldseg, slru_filename_old); + close_file(&newseg, slru_filename_new); +} + +static inline SLRUSegmentState +create_slru_segment_state(MultiXactId mxid, + int offsets_per_page, + int pages_per_segment, + char *dir) +{ + SLRUSegmentState seg = {0}; + int64 n; + + n = mxid / offsets_per_page; + seg.pageno = n % pages_per_segment; + seg.segno = n / pages_per_segment; + seg.dir = dir; + + return seg; +} + +/* + * 
Convert pg_multixact/offsets segments and return oldest mxid offset. + */ +MultiXactOffset +convert_multixact_offsets(const char *old_subdir, const char *new_subdir) +{ + SLRUSegmentState oldseg, + newseg; + MultiXactOffset32 oldbuf[MULTIXACT_OFFSETS_PER_PAGE_OLD] = {0}; + MultiXactOffset newbuf[MULTIXACT_OFFSETS_PER_PAGE] = {0}; + MultiXactOffset32 oldest_mxoff = 0; + MultiXactId oldest_mxid, + next_mxid, + mxid; + uint64 old_entry, + new_entry; + bool oldest_mxoff_known = false; + + StaticAssertStmt((sizeof(oldbuf) == BLCKSZ && sizeof(newbuf) == BLCKSZ), + "buf should be BLCKSZ"); + + oldest_mxid = old_cluster.controldata.chkpnt_oldstMulti; + + oldseg = create_slru_segment_state(oldest_mxid, + MULTIXACT_OFFSETS_PER_PAGE_OLD, + SLRU_PAGES_PER_SEGMENT_OLD, + psprintf("%s/%s", old_cluster.pgdata, + old_subdir)); + + newseg = create_slru_segment_state(oldest_mxid, + MULTIXACT_OFFSETS_PER_PAGE, + SLRU_PAGES_PER_SEGMENT, + psprintf("%s/%s", new_cluster.pgdata, + new_subdir)); + + old_entry = oldest_mxid % MULTIXACT_OFFSETS_PER_PAGE_OLD; + new_entry = oldest_mxid % MULTIXACT_OFFSETS_PER_PAGE; + + next_mxid = old_cluster.controldata.chkpnt_nxtmulti; + if (next_mxid < oldest_mxid) + next_mxid += (MultiXactId) 1 << 32; /* wraparound */ + + prep_status("Converting old %s to new format", old_subdir); + + /* Copy mxid offsets reading only needed segment pages */ + for (mxid = oldest_mxid; mxid < next_mxid; old_entry = 0) + { + int oldlen; + bool is_empty; + + /* Handle possible segment wraparound */ + if (oldseg.segno > MaxMultiXactId32 / MULTIXACT_OFFSETS_PER_PAGE_OLD / SLRU_PAGES_PER_SEGMENT_OLD) /* 0xFFFF */ + oldseg.segno = 0; + + oldlen = read_old_segment_page(&oldseg, oldbuf, &is_empty); + + if (oldlen == 0 || is_empty) + pg_fatal("cannot read page %lld from segment: %s\n", + (long long) oldseg.pageno, + slru_filename_old(oldseg.dir, oldseg.segno)); + + /* Save oldest mxid offset */ + if (!oldest_mxoff_known) + { + oldest_mxoff = oldbuf[old_entry]; + oldest_mxoff_known 
= true; + } + + /* Skip wrapped-around invalid MultiXactIds */ + if (mxid == (MultiXactId) 1 << 32) + { + Assert(oldseg.segno == 0); + Assert(oldseg.pageno == 1); + Assert(old_entry == 0); + mxid += FirstMultiXactId; + old_entry = FirstMultiXactId; + } + + /* Copy entries to the new page */ + for (; mxid < next_mxid && old_entry < MULTIXACT_OFFSETS_PER_PAGE_OLD; + mxid++, old_entry++) + { + MultiXactOffset mxoff = oldbuf[old_entry]; + + /* Handle possible offset wraparound (1 becomes 2^32) */ + if (mxoff < oldest_mxoff) + mxoff += ((MultiXactOffset) 1 << 32) - 1; + + /* Subtract oldest_mxoff, so new offsets will start from 1 */ + newbuf[new_entry++] = mxoff - oldest_mxoff + 1; + + if (new_entry >= MULTIXACT_OFFSETS_PER_PAGE) + { + /* Write new page */ + write_new_segment_page(&newseg, newbuf, false); + new_entry = 0; + } + } + } + + /* Write the last incomplete page */ + if (new_entry > 0 || oldest_mxid == next_mxid) + { + memset(&newbuf[new_entry], 0, + sizeof(newbuf[0]) * (MULTIXACT_OFFSETS_PER_PAGE - new_entry)); + write_new_segment_page(&newseg, newbuf, false); + } + + /* Use next_mxoff as oldest_mxoff, if oldest_mxid == next_mxid */ + if (!oldest_mxoff_known) + { + Assert(oldest_mxid == next_mxid); + oldest_mxoff = (MultiXactOffset) old_cluster.controldata.chkpnt_nxtmxoff; + } + + /* Release resources */ + close_file(&oldseg, slru_filename_old); + close_file(&newseg, slru_filename_new); + + pfree((char *) oldseg.dir); + pfree((char *) newseg.dir); + + check_ok(); + + return oldest_mxoff; +} + +/* + * Convert pg_multixact/members segments, offsets will start from 1. 
+ */ +void +convert_multixact_members(const char *old_subdir, const char *new_subdir, + MultiXactOffset oldest_mxoff) +{ + MultiXactOffset next_mxoff, + mxoff; + SLRUSegmentState oldseg, + newseg; + char oldbuf[BLCKSZ] = {0}, + newbuf[BLCKSZ] = {0}; + int newgroup, + newmember; + char *newflag = newbuf; + TransactionId *newxid; + int oldidx, + newidx; + + prep_status("Converting old %s to new format", old_subdir); + + next_mxoff = (MultiXactOffset) old_cluster.controldata.chkpnt_nxtmxoff; + if (next_mxoff < oldest_mxoff) + next_mxoff += (MultiXactOffset) 1 << 32; + + newxid = (TransactionId *) (newflag + MXACT_MEMBER_FLAGS_PER_BYTE * MULTIXACT_MEMBERS_PER_MEMBERGROUP); + + /* Initialize old starting position */ + oldidx = oldest_mxoff % MULTIXACT_MEMBERS_PER_PAGE_OLD; + oldseg = create_slru_segment_state(oldest_mxoff, + MULTIXACT_MEMBERS_PER_PAGE_OLD, + SLRU_PAGES_PER_SEGMENT_OLD, + psprintf("%s/%s", old_cluster.pgdata, + old_subdir)); + + /* Initialize empty new segment */ + newseg = create_slru_segment_state(0, 1, 1, + psprintf("%s/%s", new_cluster.pgdata, + new_subdir)); + + /* Initialize new starting position (skip invalid zero offset) */ + newgroup = 0; + newidx = 1; + newmember = 1; + newflag++; + newxid++; + + /* Iterate through the original directory */ + for (mxoff = oldest_mxoff; mxoff < next_mxoff; oldidx = 0) + { + bool old_is_empty; + int oldlen; + int ngroups; + int oldgroup; + int oldmember; + + oldlen = read_old_segment_page(&oldseg, oldbuf, &old_is_empty); + + if (oldlen == 0 || old_is_empty) + pg_fatal("cannot read page %lld from segment: %s\n", + (long long) oldseg.pageno, + slru_filename_old(oldseg.dir, oldseg.segno)); + + ngroups = oldlen / MULTIXACT_MEMBERGROUP_SIZE_OLD; + + /* Iterate through old member groups */ + for (oldgroup = oldidx / MULTIXACT_MEMBERS_PER_MEMBERGROUP_OLD, + oldmember = oldidx % MULTIXACT_MEMBERS_PER_MEMBERGROUP_OLD; + oldgroup < ngroups && mxoff < next_mxoff; + oldgroup++, oldmember = 0) + { + char *oldflag = (char *) 
oldbuf + oldgroup * MULTIXACT_MEMBERGROUP_SIZE_OLD; + TransactionId32 *oldxid = (TransactionId32 *) (oldflag + MULTIXACT_FLAGBYTES_PER_GROUP_OLD); + + oldxid += oldmember; + oldflag += oldmember; + + /* Iterate through old members */ + for (int i = 0; + i < MULTIXACT_MEMBERS_PER_MEMBERGROUP_OLD && mxoff < next_mxoff; + i++) + { + /* Copy member's xid and flags to the new page */ + *newflag++ = *oldflag++; + *newxid++ = (TransactionId) * oldxid++; + + newidx++; + oldidx++; + mxoff++; + + if (++newmember >= MULTIXACT_MEMBERS_PER_MEMBERGROUP) + { + /* Start next member group */ + newmember = 0; + + if (++newgroup >= MULTIXACT_MEMBERGROUPS_PER_PAGE) + { + /* Write current page and start new */ + newgroup = 0; + newidx = 0; + write_new_segment_page(&newseg, newbuf, false); + memset(newbuf, 0, BLCKSZ); + } + + newflag = (char *) newbuf + newgroup * MULTIXACT_MEMBERGROUP_SIZE; + newxid = (TransactionId *) (newflag + MXACT_MEMBER_FLAGS_PER_BYTE * MULTIXACT_MEMBERS_PER_MEMBERGROUP); + } + + /* Handle offset wraparound */ + if (mxoff > MaxMultiXactOffset32) + { + Assert(mxoff == (MultiXactOffset) 1 << 32); + Assert(oldseg.segno == MaxMultiXactOffset32 / MULTIXACT_MEMBERS_PER_PAGE_OLD / SLRU_PAGES_PER_SEGMENT_OLD); + Assert(oldseg.pageno == MaxMultiXactOffset32 / MULTIXACT_MEMBERS_PER_PAGE_OLD % SLRU_PAGES_PER_SEGMENT_OLD); + Assert(oldmember == MaxMultiXactOffset32 % MULTIXACT_MEMBERS_PER_PAGE_OLD); + + /* Switch to segment 0000 */ + close_file(&oldseg, slru_filename_old); + oldseg.segno = 0; + oldseg.pageno = 0; + + oldidx = 1; /* skip invalid zero mxid offset */ + } + } + } + } + + /* Write last page, unless it is empty */ + if (newflag > (char *) newbuf || oldest_mxoff == next_mxoff) + write_new_segment_page(&newseg, newbuf, false); + + /* Release resources */ + close_file(&oldseg, slru_filename_old); + close_file(&newseg, slru_filename_new); + + pfree((char *) oldseg.dir); + pfree((char *) newseg.dir); + + check_ok(); +} diff --git 
a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index add6ea9c34..9ebb8d1063 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -84,7 +84,7 @@ my $oldnode = # increasing test runtime, run these tests with a custom setting. # --allow-group-access and --wal-segsize have been added in v11. my %node_params = (); -$node_params{extra} = [ '--wal-segsize', '1', '--allow-group-access' ] +$node_params{extra} = [ '--wal-segsize', '1', '--allow-group-access', '-x', '21000000000' ] if $oldnode->pg_version >= 11; $oldnode->init(%node_params); $oldnode->start; @@ -178,6 +178,14 @@ if (defined($ENV{oldinstall})) 'ran adapt script'); } +$oldnode->safe_psql('regression', + "CREATE TABLE t1 (id SERIAL NOT NULL PRIMARY KEY, plt text, pln NUMERIC(8, 4)); + INSERT INTO t1 (plt, pln) SELECT md5(random()::text), random() * 9999 FROM generate_series(1, 1000);"); +my $relfrozenxid = $oldnode->safe_psql('regression', + "SELECT relfrozenxid FROM pg_class WHERE relname = 't1';"); +my $relminmxid = $oldnode->safe_psql('regression', + "SELECT relminmxid FROM pg_class WHERE relname = 't1';"); + # Take a dump before performing the upgrade as a base comparison. Note # that we need to use pg_dumpall from the new node here. my @dump_command = ( @@ -290,6 +298,16 @@ ok( !-d $newnode->data_dir . "/pg_upgrade_output.d", $newnode->start; +my $relfrozenxid_new = $newnode->safe_psql('regression', + "SELECT relfrozenxid FROM pg_class WHERE relname = 't1';"); + +is($relfrozenxid_new, $relfrozenxid, 'old and new relfrozenxid match after pg_upgrade'); + +my $relminmxid_new = $newnode->safe_psql('regression', + "SELECT relminmxid FROM pg_class WHERE relname = 't1';"); + +is($relminmxid_new, $relminmxid, 'old and new relminmxid match after pg_upgrade'); + # Check if there are any logs coming from pg_upgrade, that would only be # retained on failure. my $log_path = $newnode->data_dir . 
"/pg_upgrade_output.d"; diff --git a/src/bin/pg_upgrade/version.c b/src/bin/pg_upgrade/version.c index dc19fc6ec8..20763af6d2 100644 --- a/src/bin/pg_upgrade/version.c +++ b/src/bin/pg_upgrade/version.c @@ -9,6 +9,7 @@ #include "postgres_fe.h" +#include "access/transam.h" #include "catalog/pg_class_d.h" #include "fe_utils/string_utils.h" #include "pg_upgrade.h" @@ -242,19 +243,21 @@ old_9_6_check_for_unknown_data_type_usage(ClusterInfo *cluster) } /* - * old_9_6_invalidate_hash_indexes() - * 9.6 -> 10 - * Hash index binary format has changed from 9.6->10.0 + * invalidate_indexes() + * Invalidates all indexes satisfying given predicate. */ -void -old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode) +static void +invalidate_indexes(ClusterInfo *cluster, bool check_mode, + const char *name, const char *pred) { int dbnum; FILE *script = NULL; bool found = false; - char *output_path = "reindex_hash.sql"; + char output_path[MAXPGPATH]; + + snprintf(output_path, sizeof(output_path), "reindex_%s.sql", name); - prep_status("Checking for hash indexes"); + prep_status("Checking for %s indexes", name); for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) { @@ -267,9 +270,16 @@ old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode) DbInfo *active_db = &cluster->dbarr.dbs[dbnum]; PGconn *conn = connectToServer(cluster, active_db->db_name); - /* find hash indexes */ - res = executeQueryOrDie(conn, - "SELECT n.nspname, c.relname " + + /* + * Find indexes satisfying predicate. + * + * System indexes (with oids < FirstNormalObjectId) are excluded from + * the search as they are recreated in the new cluster during initdb. 
+ */ + res = executeQueryOrDie( + conn, + "SELECT n.nspname, c.relname, i.indexrelid " "FROM pg_catalog.pg_class c, " " pg_catalog.pg_index i, " " pg_catalog.pg_am a, " @@ -277,8 +287,11 @@ old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode) "WHERE i.indexrelid = c.oid AND " " c.relam = a.oid AND " " c.relnamespace = n.oid AND " - " a.amname = 'hash'" - ); + " i.indexrelid >= '%u'::pg_catalog.oid AND " + " %s " + "ORDER BY i.indexrelid ASC", + FirstNormalObjectId, + pred); ntups = PQntuples(res); i_nspname = PQfnumber(res, "nspname"); @@ -311,8 +324,14 @@ old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode) if (!check_mode && db_used) { - /* mark hash indexes as invalid */ - PQclear(executeQueryOrDie(conn, + /* + * Mark indexes satisfying predicate as invalid. + * + * System indexes (with oids < FirstNormalObjectId) are excluded + * from the search (see above). + */ + PQclear(executeQueryOrDie( + conn, "UPDATE pg_catalog.pg_index i " "SET indisvalid = false " "FROM pg_catalog.pg_class c, " @@ -321,7 +340,10 @@ old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode) "WHERE i.indexrelid = c.oid AND " " c.relam = a.oid AND " " c.relnamespace = n.oid AND " - " a.amname = 'hash'")); + " i.indexrelid >= '%u'::pg_catalog.oid AND " + " %s", + FirstNormalObjectId, + pred)); } PQfinish(conn); @@ -335,24 +357,37 @@ old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode) report_status(PG_WARNING, "warning"); if (check_mode) pg_log(PG_WARNING, "\n" - "Your installation contains hash indexes. These indexes have different\n" + "Your installation contains %s indexes. These indexes have different\n" "internal formats between your old and new clusters, so they must be\n" "reindexed with the REINDEX command. After upgrading, you will be given\n" - "REINDEX instructions."); + "REINDEX instructions.", + name); else pg_log(PG_WARNING, "\n" - "Your installation contains hash indexes. 
These indexes have different\n" + "Your installation contains %s indexes. These indexes have different\n" "internal formats between your old and new clusters, so they must be\n" "reindexed with the REINDEX command. The file\n" " %s\n" "when executed by psql by the database superuser will recreate all invalid\n" "indexes; until then, none of these indexes will be used.", + name, output_path); } else check_ok(); } +/* + * old_9_6_invalidate_hash_indexes() + * 9.6 -> 10 + * Hash index binary format has changed from 9.6->10.0 + */ +void +old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode) +{ + invalidate_indexes(cluster, check_mode, "hash", "a.amname = 'hash'"); +} + /* * old_11_check_for_sql_identifier_data_type_usage() * 11 -> 12 @@ -459,3 +494,36 @@ report_extension_updates(ClusterInfo *cluster) else check_ok(); } + +/* + * invalidate_spgist_indexes() + * 32bit -> 64bit + * SP-GIST contains xids. + */ +void +invalidate_spgist_indexes(ClusterInfo *cluster, bool check_mode) +{ + invalidate_indexes(cluster, check_mode, "spgist", "a.amname = 'spgist'"); +} + +/* + * invalidate_gin_indexes() + * 32bit -> 64bit + * Gin indexes contains xids in deleted pages. 
+ */ +void +invalidate_gin_indexes(ClusterInfo *cluster, bool check_mode) +{ + invalidate_indexes(cluster, check_mode, "gin", "a.amname = 'gin'"); +} + +/* + * invalidate_external_indexes() + * Generate script to REINDEX non standard external indexes (like RUM etc) + */ +void +invalidate_external_indexes(ClusterInfo *cluster, bool check_mode) +{ + invalidate_indexes(cluster, check_mode, "external", + "NOT a.amname IN ('btree', 'hash', 'gist', 'gin', 'spgist', 'brin')"); +} diff --git a/src/bin/pg_verifybackup/t/003_corruption.pl b/src/bin/pg_verifybackup/t/003_corruption.pl index f1ceb4a4bd..f4109471ac 100644 --- a/src/bin/pg_verifybackup/t/003_corruption.pl +++ b/src/bin/pg_verifybackup/t/003_corruption.pl @@ -174,7 +174,7 @@ sub mutilate_extra_tablespace_file sub mutilate_missing_file { my ($backup_path) = @_; - my $pathname = "$backup_path/pg_xact/000000000000"; + my $pathname = "$backup_path/pg_xact/000000123000"; unlink($pathname) || die "$pathname: $!"; return; } diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 5a82cfdab2..cb11ca27f5 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -919,7 +919,7 @@ main(int argc, char **argv) config.filter_by_fpw = true; break; case 'x': - if (sscanf(optarg, "%u", &config.filter_by_xid) != 1) + if (sscanf(optarg, "%" INT64_MODIFIER "u", &config.filter_by_xid) != 1) { pg_log_error("invalid transaction ID specification: \"%s\"", optarg); diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 543f2e2643..73bc172309 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -31,7 +31,7 @@ typedef int XidStatus; typedef struct xl_clog_truncate { - int pageno; + int64 pageno; TransactionId oldestXact; Oid oldestXactDb; } xl_clog_truncate; diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h index 9347f464f3..4db042c319 100644 --- a/src/include/access/ginblock.h +++ b/src/include/access/ginblock.h @@ -133,8 
+133,15 @@ typedef struct GinMetaPageData * We should reclaim deleted page only once every transaction started before * its deletion is over. */ -#define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid ) -#define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid) +#define GinPageGetDeleteXid(page) ( \ + (((PageHeader) (page))->pd_upper == BLCKSZ - sizeof(GinPageOpaqueData) - sizeof(TransactionId)) ? \ + *((TransactionId *) ((char *) (page) + BLCKSZ - sizeof(GinPageOpaqueData) - sizeof(TransactionId))) : \ + InvalidTransactionId ) +#define GinPageSetDeleteXid(page, xid) \ + do { \ + ((PageHeader) (page))->pd_upper = BLCKSZ - sizeof(GinPageOpaqueData) - sizeof(TransactionId); \ + *((TransactionId *) ((char *) (page) + BLCKSZ - sizeof(GinPageOpaqueData) - sizeof(TransactionId))) = xid; \ + } while (false) extern bool GinPageIsRecyclable(Page page); /* diff --git a/src/include/access/gist.h b/src/include/access/gist.h index a3337627b8..41de3052fd 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -223,7 +223,7 @@ GistPageGetDeleteXid(Page page) return ((GISTDeletedPageContents *) PageGetContents(page))->deleteXid; } else - return FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); + return FullTransactionIdFromXid(FirstNormalTransactionId); } /* diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 9dab35551e..ed1e7e44f3 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -146,6 +146,8 @@ extern void ReleaseBulkInsertStatePin(BulkInsertState bistate); extern void heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate); +extern void rewrite_page_prepare_for_xid(Page page, HeapTuple tup, + bool is_toast); extern void heap_multi_insert(Relation relation, struct TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate); @@ -164,14 +166,14 @@ extern TM_Result 
heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, struct TM_FailureData *tmfd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); -extern bool heap_freeze_tuple(HeapTupleHeader tuple, +extern bool heap_freeze_tuple(HeapTuple tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId cutoff_xid, TransactionId cutoff_multi); -extern bool heap_tuple_would_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, +extern bool heap_tuple_would_freeze(HeapTuple htup, TransactionId cutoff_xid, MultiXactId cutoff_multi, TransactionId *relfrozenxid_out, MultiXactId *relminmxid_out); -extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); +extern bool heap_tuple_needs_eventual_freeze(HeapTuple htup); extern void simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid); @@ -189,12 +191,16 @@ extern int heap_page_prune(Relation relation, Buffer buffer, TransactionId old_snap_xmin, TimestampTz old_snap_ts, int *nnewlpdead, - OffsetNumber *off_loc); + OffsetNumber *off_loc, + bool repairFragmentation); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, OffsetNumber *nowdead, int ndead, - OffsetNumber *nowunused, int nunused); -extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); + OffsetNumber *nowunused, int nunused, + bool repairFragmentation, + bool is_toast); +extern void heap_get_root_tuples(Relation relation, Buffer buffer, Page page, + OffsetNumber *root_offsets); /* in heap/vacuumlazy.c */ struct VacuumParams; @@ -212,7 +218,7 @@ extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer TransactionId *dead_after); extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid); -extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); +extern bool HeapTupleIsOnlyLocked(HeapTuple htup); extern bool 
XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); extern bool HeapTupleIsSurelyDead(HeapTuple htup, struct GlobalVisState *vistest); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 34220d93cf..54e3e3759a 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -59,6 +59,8 @@ #define XLOG_HEAP2_LOCK_UPDATED 0x60 #define XLOG_HEAP2_NEW_CID 0x70 +#define XLOG_HEAP3_BASE_SHIFT 0x00 + /* * xl_heap_insert/xl_heap_multi_insert flag values, 8 bits are available. */ @@ -98,6 +100,7 @@ #define XLH_DELETE_CONTAINS_OLD_KEY (1<<2) #define XLH_DELETE_IS_SUPER (1<<3) #define XLH_DELETE_IS_PARTITION_MOVE (1<<4) +#define XLH_DELETE_PAGE_ON_TOAST_RELATION (1<<5) /* convenience macro for checking whether any form of old tuple was logged */ #define XLH_DELETE_CONTAINS_OLD \ @@ -240,15 +243,19 @@ typedef struct xl_heap_update * * Acquires a full cleanup lock. */ +#define XLH_PRUNE_ON_TOAST_RELATION 0x01 +#define XLH_PRUNE_REPAIR_FRAGMENTATION 0x02 + typedef struct xl_heap_prune { TransactionId latestRemovedXid; uint16 nredirected; uint16 ndead; + uint8 flags; /* OFFSET NUMBERS are in the block reference 0 */ } xl_heap_prune; -#define SizeOfHeapPrune (offsetof(xl_heap_prune, ndead) + sizeof(uint16)) +#define SizeOfHeapPrune (offsetof(xl_heap_prune, flags) + sizeof(uint8)) /* * The vacuum page record is similar to the prune record, but can only mark @@ -336,13 +343,16 @@ typedef struct xl_heap_freeze_tuple * Backup block 0's data contains an array of xl_heap_freeze_tuple structs, * one for each tuple. 
*/ +#define XLH_FREEZE_PAGE_ON_TOAST_RELATION 0x01 + typedef struct xl_heap_freeze_page { TransactionId cutoff_xid; uint16 ntuples; + uint8 flags; } xl_heap_freeze_page; -#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, ntuples) + sizeof(uint16)) +#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, flags) + sizeof(uint8)) /* * This is what we need to know about setting a visibility map bit @@ -389,7 +399,19 @@ typedef struct xl_heap_rewrite_mapping XLogRecPtr start_lsn; /* Insert LSN at begin of rewrite */ } xl_heap_rewrite_mapping; -extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, +#define XLH_BASE_SHIFT_ON_TOAST_RELATION 0x01 + +/* shift the base of xids on heap page */ +typedef struct xl_heap_base_shift +{ + int64 delta; /* delta value to shift the base */ + bool multi; /* true to shift multixact base */ + uint8 flags; +} xl_heap_base_shift; + +#define SizeOfHeapBaseShift (offsetof(xl_heap_base_shift, flags) + sizeof(uint8)) + +extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTuple tuple, TransactionId *latestRemovedXid); extern void heap_redo(XLogReaderState *record); @@ -399,12 +421,15 @@ extern void heap_mask(char *pagedata, BlockNumber blkno); extern void heap2_redo(XLogReaderState *record); extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); +extern void heap3_redo(XLogReaderState *record); +extern void heap3_desc(StringInfo buf, XLogReaderState *record); +extern const char *heap3_identify(uint8 info); extern void heap_xlog_logical_rewrite(XLogReaderState *r); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples, int ntuples); -extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, +extern bool heap_prepare_freeze_tuple(HeapTuple htup, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId cutoff_xid, @@ -413,8 +438,11 @@ extern bool 
heap_prepare_freeze_tuple(HeapTupleHeader tuple, bool *totally_frozen, TransactionId *relfrozenxid_out, MultiXactId *relminmxid_out); -extern void heap_execute_freeze_tuple(HeapTupleHeader tuple, +extern void heap_execute_freeze_tuple(HeapTuple tuple, xl_heap_freeze_tuple *frz); +extern void heap_execute_freeze_tuple_page(Page page, HeapTupleHeader htup, + xl_heap_freeze_tuple *xlrec_tp, + bool is_toast); extern XLogRecPtr log_heap_visible(RelFileLocator rlocator, Buffer heap_buffer, Buffer vm_buffer, TransactionId cutoff_xid, uint8 vmflags); diff --git a/src/include/access/heaptoast.h b/src/include/access/heaptoast.h index a75699054a..3916a5f05e 100644 --- a/src/include/access/heaptoast.h +++ b/src/include/access/heaptoast.h @@ -20,10 +20,19 @@ /* * Find the maximum size of a tuple if there are to be N tuples per page. */ +#if MAXIMUM_ALIGNOF == 8 #define MaximumBytesPerTuple(tuplesPerPage) \ MAXALIGN_DOWN((BLCKSZ - \ - MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData))) \ + MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData)) - MAXALIGN(sizeof(HeapPageSpecialData))) \ / (tuplesPerPage)) +#elif MAXIMUM_ALIGNOF == 4 +#define MaximumBytesPerTuple(tuplesPerPage) \ + MAXALIGN_DOWN((BLCKSZ - \ + MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData)) - MAXALIGN(sizeof(ToastPageSpecialData))) \ + / (tuplesPerPage)) +#else +#error "unknown arch bitness" +#endif /* * These symbols control toaster activation. If a tuple is larger than diff --git a/src/include/access/htup.h b/src/include/access/htup.h index a4bc7256ed..ae61f92471 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -54,6 +54,12 @@ typedef MinimalTupleData *MinimalTuple; * this can't be told apart from case #1 by inspection; code setting up * or destroying this representation has to know what it's doing. * + * t_xmin and t_xmax are TransactionId values stored in heap tuple header. 
+ * Normally they are calculated from ShortTransactionId-sized on-disk tuple + * xmin/xmax representation: + * t_data->t_choice.t_heap.t_xmin/t_data->t_choice.t_heap.t_xmax + * and pd_xid_base and pd_multi_base common values for all tuples on a page. + * * t_len should always be valid, except in the pointer-to-nothing case. * t_self and t_tableOid should be valid if the HeapTupleData points to * a disk buffer, or if it represents a copy of a tuple on disk. They @@ -61,10 +67,12 @@ typedef MinimalTupleData *MinimalTuple; */ typedef struct HeapTupleData { + TransactionId t_xmin; /* calculated tuple xmin */ + TransactionId t_xmax; /* calculated tuple xmax */ uint32 t_len; /* length of *t_data */ ItemPointerData t_self; /* SelfItemPointer */ Oid t_tableOid; /* table the tuple came from */ -#define FIELDNO_HEAPTUPLEDATA_DATA 3 +#define FIELDNO_HEAPTUPLEDATA_DATA 5 HeapTupleHeader t_data; /* -> tuple header and data */ } HeapTupleData; @@ -78,12 +86,12 @@ typedef HeapTupleData *HeapTuple; #define HeapTupleIsValid(tuple) PointerIsValid(tuple) /* HeapTupleHeader functions implemented in utils/time/combocid.c */ -extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup); -extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup); -extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, +extern CommandId HeapTupleGetCmin(HeapTuple tup); +extern CommandId HeapTupleGetCmax(HeapTuple tup); +extern void HeapTupleHeaderAdjustCmax(HeapTuple tup, CommandId *cmax, bool *iscombo); /* Prototype for HeapTupleHeader accessors in heapam.c */ -extern TransactionId HeapTupleGetUpdateXid(HeapTupleHeader tuple); +extern TransactionId HeapTupleGetUpdateXid(HeapTuple tuple); #endif /* HTUP_H */ diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 9561c835f2..8bd9cfb82c 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -19,6 +19,7 @@ #include "access/tupdesc.h" #include "access/tupmacs.h" #include
"storage/bufpage.h" +#include "storage/bufmgr.h" /* * MaxTupleAttributeNumber limits the number of (user) columns in a tuple. @@ -120,13 +121,13 @@ typedef struct HeapTupleFields { - TransactionId t_xmin; /* inserting xact ID */ - TransactionId t_xmax; /* deleting or locking xact ID */ + ShortTransactionId t_xmin; /* inserting xact ID */ + ShortTransactionId t_xmax; /* deleting or locking xact ID */ union { CommandId t_cid; /* inserting or deleting command ID, or both */ - TransactionId t_xvac; /* old-style VACUUM FULL xact ID */ + ShortTransactionId t_xvac; /* old-style VACUUM FULL xact ID */ } t_field3; } HeapTupleFields; @@ -222,7 +223,7 @@ struct HeapTupleHeaderData * HEAP_XMAX_LOCK_ONLY bit is set; or, for pg_upgrade's sake, if the Xmax is * not a multi and the EXCL_LOCK bit is set. * - * See also HeapTupleHeaderIsOnlyLocked, which also checks for a possible + * See also HeapTupleIsOnlyLocked, which also checks for a possible * aborted updater transaction. * * Beware of multiple evaluations of the argument. @@ -298,27 +299,85 @@ struct HeapTupleHeaderData */ /* - * HeapTupleHeaderGetRawXmin returns the "raw" xmin field, which is the xid + * Copy base values for xid and multixacts from one heap tuple to heap tuple. + * Should be called on tuple copy or when making dest tuple based on src tuple, + * preserving visibility information. + */ +#define HeapTupleCopyBase(dest, src) \ +{ \ + (dest)->t_xmin = (src)->t_xmin; \ + (dest)->t_xmax = (src)->t_xmax; \ +} + +/* + * Set base values for tuple xids/multixacts to zero. Used when visibility + * information is negligible or will be set later. + */ +#define HeapTupleSetZeroBase(tup) \ +{ \ + (tup)->t_xmin = 0; \ + (tup)->t_xmax = 0; \ +} + +/* + * Copy HeapTupleHeader xmin/xmax in raw way ??? + */ +#define HeapTupleCopyHeaderXids(tup) \ +{ \ + (tup)->t_xmin = (tup)->t_data->t_choice.t_heap.t_xmin; \ + (tup)->t_xmax = (tup)->t_data->t_choice.t_heap.t_xmax; \ +} + +/* + * Macros for accessing "double xmax".
On pg_upgraded instances, it might + * happen that we can't fit new special area to the page. But we still + * might need to write xmax of tuples for updates and deletes. The trick is + * that we actually don't need xmin field. After pg_upgrade (which implies + * restart) no insertions went to this page yet (otherwise special area could + * fit). So, if tuple is visible (otherwise it would be deleted), then it's + * visible for everybody. Thus, t_xmin isn't needed. Therefore, we can use + * both t_xmin and t_xmax to store 64-bit xmax. + * + * See heap_convert.c for details. + */ +#define HeapTupleHeaderGetDoubleXmax(tup) \ + ((TransactionId)(tup)->t_choice.t_heap.t_xmax + \ + ((TransactionId)(tup)->t_choice.t_heap.t_xmin << 32)) + +#define HeapTupleHeaderSetDoubleXmax(tup, xid) \ +do { \ + (tup)->t_choice.t_heap.t_xmax = (TransactionId) (xid) & 0xFFFFFFFF; \ + (tup)->t_choice.t_heap.t_xmin = ((TransactionId) (xid) >> 32) & 0xFFFFFFFF; \ +} while (0) + +/* + * HeapTupleGetRawXmin returns the "raw" xmin field, which is the xid * originally used to insert the tuple. However, the tuple might actually * be frozen (via HeapTupleHeaderSetXminFrozen) in which case the tuple's xmin * is visible to every snapshot. Prior to PostgreSQL 9.4, we actually changed * the xmin to FrozenTransactionId, and that value may still be encountered * on disk. */ -#define HeapTupleHeaderGetRawXmin(tup) \ +#define HeapTupleGetRawXmin(tup) ((tup)->t_xmin) + +#define HeapTupleGetXmin(tup) \ ( \ - (tup)->t_choice.t_heap.t_xmin \ + HeapTupleHeaderXminFrozen((tup)->t_data) ? \ + FrozenTransactionId : HeapTupleGetRawXmin(tup) \ ) -#define HeapTupleHeaderGetXmin(tup) \ +#define HeapTupleSetXmin(tup, xid) ((tup)->t_xmin = (xid)) + +#define HeapTupleHeaderSetXmin(page, tup) \ ( \ - HeapTupleHeaderXminFrozen(tup) ?
\ - FrozenTransactionId : HeapTupleHeaderGetRawXmin(tup) \ + AssertMacro(!HeapPageIsDoubleXmax(page)), \ + (tup)->t_data->t_choice.t_heap.t_xmin = NormalTransactionIdToShort(HeapPageGetSpecial(page)->pd_xid_base, (tup)->t_xmin) \ ) -#define HeapTupleHeaderSetXmin(tup, xid) \ +#define ToastTupleHeaderSetXmin(page, tup) \ ( \ - (tup)->t_choice.t_heap.t_xmin = (xid) \ + AssertMacro(!HeapPageIsDoubleXmax(page)), \ + (tup)->t_data->t_choice.t_heap.t_xmin = NormalTransactionIdToShort(ToastPageGetSpecial(page)->pd_xid_base, (tup)->t_xmin) \ ) #define HeapTupleHeaderXminCommitted(tup) \ @@ -337,18 +396,6 @@ struct HeapTupleHeaderData ((tup)->t_infomask & (HEAP_XMIN_FROZEN)) == HEAP_XMIN_FROZEN \ ) -#define HeapTupleHeaderSetXminCommitted(tup) \ -( \ - AssertMacro(!HeapTupleHeaderXminInvalid(tup)), \ - ((tup)->t_infomask |= HEAP_XMIN_COMMITTED) \ -) - -#define HeapTupleHeaderSetXminInvalid(tup) \ -( \ - AssertMacro(!HeapTupleHeaderXminCommitted(tup)), \ - ((tup)->t_infomask |= HEAP_XMIN_INVALID) \ -) - #define HeapTupleHeaderSetXminFrozen(tup) \ ( \ AssertMacro(!HeapTupleHeaderXminInvalid(tup)), \ @@ -362,30 +409,67 @@ struct HeapTupleHeaderData * to resolve the MultiXactId if necessary. This might involve multixact I/O, * so it should only be used if absolutely necessary. */ -#define HeapTupleHeaderGetUpdateXid(tup) \ +#define HeapTupleGetUpdateXidAny(tup) \ ( \ - (!((tup)->t_infomask & HEAP_XMAX_INVALID) && \ - ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \ - !((tup)->t_infomask & HEAP_XMAX_LOCK_ONLY)) ? \ + (!((tup)->t_data->t_infomask & HEAP_XMAX_INVALID) && \ + ((tup)->t_data->t_infomask & HEAP_XMAX_IS_MULTI) && \ + !((tup)->t_data->t_infomask & HEAP_XMAX_LOCK_ONLY)) ? 
\ HeapTupleGetUpdateXid(tup) \ : \ - HeapTupleHeaderGetRawXmax(tup) \ + HeapTupleGetRawXmax(tup) \ ) -#define HeapTupleHeaderGetRawXmax(tup) \ +#define HeapTupleGetRawXmax(tup) ((tup)->t_xmax) + +#define HeapTupleHeaderGetRawXmax(page, tup) \ ( \ - (tup)->t_choice.t_heap.t_xmax \ + HeapPageIsDoubleXmax(page) ? \ + HeapTupleHeaderGetDoubleXmax(tup) : \ + ShortTransactionIdToNormal( \ + ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) ? HeapPageGetSpecial(page)->pd_multi_base : HeapPageGetSpecial(page)->pd_xid_base, \ + (tup)->t_choice.t_heap.t_xmax) \ ) -#define HeapTupleHeaderSetXmax(tup, xid) \ +#define ToastTupleHeaderGetRawXmax(page, tup) \ ( \ - (tup)->t_choice.t_heap.t_xmax = (xid) \ + HeapPageIsDoubleXmax(page) ? \ + HeapTupleHeaderGetDoubleXmax(tup) : \ + ShortTransactionIdToNormal( \ + ToastPageGetSpecial(page)->pd_xid_base, \ + (tup)->t_choice.t_heap.t_xmax) \ ) +#define HeapTupleSetXmax(tup, xid) \ +do { \ + (tup)->t_xmax = (xid); \ +} while (0) + +#define HeapTupleHeaderSetXmax(page, tup) \ +do { \ + if (HeapPageIsDoubleXmax(page)) \ + HeapTupleHeaderSetDoubleXmax((tup)->t_data, (tup)->t_xmax); \ + else \ + (tup)->t_data->t_choice.t_heap.t_xmax = \ + NormalTransactionIdToShort( \ + ((tup)->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ? HeapPageGetSpecial(page)->pd_multi_base : HeapPageGetSpecial(page)->pd_xid_base, \ + ((tup)->t_xmax)); \ +} while (0) + +#define ToastTupleHeaderSetXmax(page, tup) \ +do { \ + if (HeapPageIsDoubleXmax(page)) \ + HeapTupleHeaderSetDoubleXmax((tup)->t_data, (tup)->t_xmax); \ + else \ + (tup)->t_data->t_choice.t_heap.t_xmax = \ + NormalTransactionIdToShort( \ + ToastPageGetSpecial(page)->pd_xid_base, \ + ((tup)->t_xmax)); \ +} while (0) + /* * HeapTupleHeaderGetRawCommandId will give you what's in the header whether - * it is useful or not. Most code should use HeapTupleHeaderGetCmin or - * HeapTupleHeaderGetCmax instead, but note that those Assert that you can + * it is useful or not. 
Most code should use HeapTupleGetCmin or + * HeapTupleGetCmax instead, but note that those Assert that you can * get a legitimate result, ie you are in the originating transaction! */ #define HeapTupleHeaderGetRawCommandId(tup) \ @@ -555,8 +639,16 @@ do { \ * an otherwise-empty page can indeed hold a tuple of this size. Because * ItemIds and tuples have different alignment requirements, don't assume that * you can, say, fit 2 tuples of size MaxHeapTupleSize/2 on the same page. + * + * On shift to 64-bit XIDs MaxHeapTupleSize decreased by sizeof(HeapPageSpecialData). + * Extant tuples with length over new MaxHeapTupleSize are inherited on DoubleXmax + * pages. They could be read, but can not be updated unless their length decreases + * to fit MaxHeapTupleSize. Vacuum full will also copy these double xmax pages + * without change. */ -#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) + +#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData)) - MAXALIGN(sizeof(HeapPageSpecialData))) +#define MaxHeapTupleSize_32 (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) #define MinHeapTupleSize MAXALIGN(SizeofHeapTupleHeader) /* @@ -690,6 +782,79 @@ struct MinimalTupleData #define HeapTupleClearHeapOnly(tuple) \ HeapTupleHeaderClearHeapOnly((tuple)->t_data) +static inline void +HeapTupleCopyXminFromPage(HeapTuple tup, Page page, bool is_toast) +{ + TransactionId base; + ShortTransactionId xmin; /* short xmin from tuple header */ + + if (HeapTupleHeaderXminFrozen(tup->t_data)) + { + tup->t_xmin = FrozenTransactionId; + return; + } + + xmin = tup->t_data->t_choice.t_heap.t_xmin; + + if (!TransactionIdIsNormal(xmin)) + base = 0; + else if (is_toast) + base = ToastPageGetSpecial(page)->pd_xid_base; + else + base = HeapPageGetSpecial(page)->pd_xid_base; + + tup->t_xmin = ShortTransactionIdToNormal(base, xmin); +} + +static inline void +HeapTupleCopyXmaxFromPage(HeapTuple tup, Page page, bool is_toast) 
+{ + TransactionId base; + ShortTransactionId xmax; /* short xmax from tuple header */ + + xmax = tup->t_data->t_choice.t_heap.t_xmax; + + if (!TransactionIdIsNormal(xmax)) + base = 0; + else if (is_toast) + /* + * Toast page is not expected to have multixacts in chunks and + * has shorter special. + */ + base = ToastPageGetSpecial(page)->pd_xid_base; + else if (tup->t_data->t_infomask & HEAP_XMAX_IS_MULTI) + base = HeapPageGetSpecial(page)->pd_multi_base; + else + base = HeapPageGetSpecial(page)->pd_xid_base; + + tup->t_xmax = ShortTransactionIdToNormal(base, xmax); +} + +/* + * Copy base values for xid and multixacts from page to heap tuple. Should be + * called each time tuple is read from page. Otherwise, it would be impossible + * to correctly read tuple xmin and xmax. + */ +static inline void +HeapTupleCopyBaseFromPage(Buffer buffer, HeapTuple tup, Page page, + bool is_toast) +{ + Assert(IsBufferLocked(buffer)); + + if (HeapPageIsDoubleXmax(page)) + { + /* + * On double xmax pages, xmax is extracted from tuple header. + */ + tup->t_xmin = FrozenTransactionId; + tup->t_xmax = HeapTupleHeaderGetDoubleXmax(tup->t_data); + return; + } + + HeapTupleCopyXminFromPage(tup, page, is_toast); + HeapTupleCopyXmaxFromPage(tup, page, is_toast); +} + /* prototypes for functions in common/heaptuple.c */ extern Size heap_compute_data_size(TupleDesc tupleDesc, Datum *values, bool *isnull); diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 4cbe17de7b..980f8bb747 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -18,16 +18,16 @@ /* * The first two MultiXactId values are reserved to store the truncation Xid - * and epoch of the first segment, so we start assigning multixact values from + * and base of the first segment, so we start assigning multixact values from * 2. 
*/ -#define InvalidMultiXactId ((MultiXactId) 0) -#define FirstMultiXactId ((MultiXactId) 1) -#define MaxMultiXactId ((MultiXactId) 0xFFFFFFFF) +#define InvalidMultiXactId UINT64CONST(0) +#define FirstMultiXactId UINT64CONST(1) +#define MaxMultiXactId UINT64CONST(0xFFFFFFFFFFFFFFFF) #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) -#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) +#define MaxMultiXactOffset UINT64CONST(0xFFFFFFFFFFFFFFFF) /* Number of SLRU buffers to use for multixact */ #define NUM_MULTIXACTOFFSET_BUFFERS 8 @@ -147,7 +147,6 @@ extern void MultiXactSetNextMXact(MultiXactId nextMulti, extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset); extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); -extern int MultiXactMemberFreezeThreshold(void); extern void multixact_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 8e4f6864e5..72b4ff5c52 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -62,8 +62,10 @@ typedef uint16 BTCycleId; typedef struct BTPageOpaqueData { BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ + /* ... or next transaction ID (lower part) */ BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ uint32 btpo_level; /* tree level --- zero for leaf pages */ + /* ... 
or next transaction ID (lower part) */ uint16 btpo_flags; /* flag bits, see below */ BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ } BTPageOpaqueData; @@ -92,6 +94,14 @@ typedef BTPageOpaqueData *BTPageOpaque; */ #define MAX_BT_CYCLE_ID 0xFF7F +/* Macros for access xact */ +#define BTP_GET_XACT(opaque) (((uint64) ((BTPageOpaque) opaque)->btpo_prev << 32) | \ + (uint64) ((BTPageOpaque) opaque)->btpo_level) +#define BTP_SET_XACT(opaque, xact) \ +do { \ + ((BTPageOpaque) opaque)->btpo_prev = (uint32) (xact >> 32); \ + ((BTPageOpaque) opaque)->btpo_level = (uint32) xact; \ +} while (0) /* * The Meta page is always the first page in the btree index. diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h index 5cc04756a5..cd7913858f 100644 --- a/src/include/access/rewriteheap.h +++ b/src/include/access/rewriteheap.h @@ -51,7 +51,7 @@ typedef struct LogicalRewriteMappingData * 6) xid of the xact performing the mapping * --- */ -#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x-%x" -extern void CheckPointLogicalRewriteHeap(void); +#define LOGICAL_REWRITE_FORMAT "map-%x-%x-%X_%X-%x_%x-%x_%x" +extern void CheckPointLogicalRewriteHeap(void); #endif /* REWRITE_HEAP_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 000bcbfdaf..8200ccff3e 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode) +PG_RMGR(RM_HEAP3_ID, "Heap3", heap3_redo, heap3_desc, heap3_identify, NULL, NULL, heap_mask, NULL) diff --git 
a/src/include/access/slru.h b/src/include/access/slru.h index 4f5a324da2..767854419d 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -21,15 +21,7 @@ /* * Define SLRU segment size. A page is the same BLCKSZ as is used everywhere * else in Postgres. The segment size can be chosen somewhat arbitrarily; - * we make it 32 pages by default, or 256Kb, i.e. 1M transactions for CLOG - * or 64K transactions for SUBTRANS. - * - * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, - * page numbering also wraps around at 0xFFFFFFFF/xxxx_XACTS_PER_PAGE (where - * xxxx is CLOG or SUBTRANS, respectively), and segment numbering at - * 0xFFFFFFFF/xxxx_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in slru.c, except when comparing - * segment and page numbers in SimpleLruTruncate (see PagePrecedes()). + * we make it 32 pages by default. */ #define SLRU_PAGES_PER_SEGMENT 32 diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index e45d73eae3..6f89426e57 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -118,7 +118,7 @@ typedef enum TM_Result * cmax is the outdating command's CID, but only when the failure code is * TM_SelfModified (i.e., something in the current transaction outdated the * tuple); otherwise cmax is zero. (We make this restriction because - * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other + * HeapTupleGetCmax doesn't work for tuples outdated in other * transactions.) */ typedef struct TM_FailureData diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 775471d2a7..e7cf1206df 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -17,6 +17,10 @@ #include "access/xlogdefs.h" +#ifndef FRONTEND +#include "utils/elog.h" +#endif + /* ---------------- * Special transaction ID values * @@ -28,11 +32,12 @@ * Note: if you need to change it, you must change pg_class.h as well. 
* ---------------- */ -#define InvalidTransactionId ((TransactionId) 0) -#define BootstrapTransactionId ((TransactionId) 1) -#define FrozenTransactionId ((TransactionId) 2) -#define FirstNormalTransactionId ((TransactionId) 3) -#define MaxTransactionId ((TransactionId) 0xFFFFFFFF) +#define InvalidTransactionId UINT64CONST(0) +#define BootstrapTransactionId UINT64CONST(1) +#define FrozenTransactionId UINT64CONST(2) +#define FirstNormalTransactionId UINT64CONST(3) +#define MaxTransactionId UINT64CONST(0xFFFFFFFFFFFFFFFF) +#define MaxShortTransactionId ((TransactionId) 0x7FFFFFFF) /* ---------------- * transaction ID manipulation macros @@ -44,17 +49,48 @@ #define TransactionIdStore(xid, dest) (*(dest) = (xid)) #define StoreInvalidTransactionId(dest) (*(dest) = InvalidTransactionId) -#define EpochFromFullTransactionId(x) ((uint32) ((x).value >> 32)) -#define XidFromFullTransactionId(x) ((uint32) (x).value) -#define U64FromFullTransactionId(x) ((x).value) +/* + * Convert short xid from/to full xid. Assertion should fail if we full xid + * doesn't fit to xid base. 
+ */ +static inline TransactionId +ShortTransactionIdToNormal(TransactionId base, ShortTransactionId xid) +{ + if (!TransactionIdIsNormal(xid)) + return (TransactionId) xid; + +#ifndef FRONTEND + /* xid + base should not overflow TransactionId */ + Assert(xid + base >= base); +#endif + + return (TransactionId) (xid + base); +} + +static inline ShortTransactionId +NormalTransactionIdToShort(TransactionId base, TransactionId xid) +{ + if (!TransactionIdIsNormal(xid)) + return (ShortTransactionId) (xid); + +#ifndef FRONTEND + /* xid should fit ShortTransactionId */ + Assert(xid >= base + FirstNormalTransactionId && + xid <= base + MaxShortTransactionId); +#endif + + return (ShortTransactionId) (xid - base); +} + +#define XidFromFullTransactionId(x) ((x).value) #define FullTransactionIdEquals(a, b) ((a).value == (b).value) #define FullTransactionIdPrecedes(a, b) ((a).value < (b).value) #define FullTransactionIdPrecedesOrEquals(a, b) ((a).value <= (b).value) #define FullTransactionIdFollows(a, b) ((a).value > (b).value) #define FullTransactionIdFollowsOrEquals(a, b) ((a).value >= (b).value) #define FullTransactionIdIsValid(x) TransactionIdIsValid(XidFromFullTransactionId(x)) -#define InvalidFullTransactionId FullTransactionIdFromEpochAndXid(0, InvalidTransactionId) -#define FirstNormalFullTransactionId FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId) +#define InvalidFullTransactionId FullTransactionIdFromXid(InvalidTransactionId) +#define FirstNormalFullTransactionId FullTransactionIdFromXid(FirstNormalTransactionId) #define FullTransactionIdIsNormal(x) FullTransactionIdFollowsOrEquals(x, FirstNormalFullTransactionId) /* @@ -68,21 +104,11 @@ typedef struct FullTransactionId } FullTransactionId; static inline FullTransactionId -FullTransactionIdFromEpochAndXid(uint32 epoch, TransactionId xid) -{ - FullTransactionId result; - - result.value = ((uint64) epoch) << 32 | xid; - - return result; -} - -static inline FullTransactionId 
-FullTransactionIdFromU64(uint64 value) +FullTransactionIdFromXid(TransactionId xid) { FullTransactionId result; - result.value = value; + result.value = xid; return result; } @@ -91,8 +117,7 @@ FullTransactionIdFromU64(uint64 value) #define TransactionIdAdvance(dest) \ do { \ (dest)++; \ - if ((dest) < FirstNormalTransactionId) \ - (dest) = FirstNormalTransactionId; \ + Assert(TransactionIdIsNormal(dest)); \ } while(0) /* @@ -140,18 +165,19 @@ FullTransactionIdAdvance(FullTransactionId *dest) /* back up a transaction ID variable, handling wraparound correctly */ #define TransactionIdRetreat(dest) \ do { \ + Assert(TransactionIdIsNormal(dest)); \ (dest)--; \ - } while ((dest) < FirstNormalTransactionId) + } while(0) /* compare two XIDs already known to be normal; this is a macro for speed */ #define NormalTransactionIdPrecedes(id1, id2) \ (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \ - (int32) ((id1) - (id2)) < 0) + (int64) ((id1) - (id2)) < 0) /* compare two XIDs already known to be normal; this is a macro for speed */ #define NormalTransactionIdFollows(id1, id2) \ (AssertMacro(TransactionIdIsNormal(id1) && TransactionIdIsNormal(id2)), \ - (int32) ((id1) - (id2)) > 0) + (int64) ((id1) - (id2)) > 0) /* ---------- * Object ID (OID) zero is InvalidOid. 
@@ -221,9 +247,6 @@ typedef struct VariableCacheData TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ TransactionId xidVacLimit; /* start forcing autovacuums here */ - TransactionId xidWarnLimit; /* start complaining here */ - TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */ - TransactionId xidWrapLimit; /* where the world ends */ Oid oldestXidDB; /* database with minimum datfrozenxid */ /* @@ -276,10 +299,6 @@ extern bool TransactionIdDidAbort(TransactionId transactionId); extern void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids); extern void TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, XLogRecPtr lsn); extern void TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids); -extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2); -extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2); -extern bool TransactionIdFollows(TransactionId id1, TransactionId id2); -extern bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2); extern TransactionId TransactionIdLatest(TransactionId mainxid, int nxids, const TransactionId *xids); extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid); @@ -319,7 +338,7 @@ ReadNextTransactionId(void) /* return transaction ID backed up by amount, handling wraparound correctly */ static inline TransactionId -TransactionIdRetreatedBy(TransactionId xid, uint32 amount) +TransactionIdRetreatedBy(TransactionId xid, uint64 amount) { xid -= amount; diff --git a/src/include/access/tupmacs.h b/src/include/access/tupmacs.h index 8b24cd3658..db8dae182d 100644 --- a/src/include/access/tupmacs.h +++ b/src/include/access/tupmacs.h @@ -131,10 +131,11 @@ fetch_att(const void *T, bool attbyval, int attlen) ((attalign) == TYPALIGN_INT) ? INTALIGN(cur_offset) : \ (((attalign) == TYPALIGN_CHAR) ? (uintptr_t) (cur_offset) : \ (((attalign) == TYPALIGN_DOUBLE) ? 
DOUBLEALIGN(cur_offset) : \ + (((attalign) == TYPALIGN_XID) ? MAXALIGN(cur_offset) : \ ( \ AssertMacro((attalign) == TYPALIGN_SHORT), \ SHORTALIGN(cur_offset) \ - ))) \ + )))) \ ) /* diff --git a/src/include/access/xact.h b/src/include/access/xact.h index c604ee11f8..3e39b01527 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -242,7 +242,7 @@ typedef struct xl_xact_xinfo * Commit records can be large, so copying large portions isn't * attractive. */ - uint32 xinfo; + uint64 xinfo; } xl_xact_xinfo; typedef struct xl_xact_dbinfo @@ -295,7 +295,12 @@ typedef struct xl_xact_invals typedef struct xl_xact_twophase { - TransactionId xid; + /* + * TransactionId is split into 32-bit parts because xl_xact_twophase is + * only int-aligned. + */ + uint32 xid_lo; + uint32 xid_hi; } xl_xact_twophase; typedef struct xl_xact_origin @@ -314,7 +319,7 @@ typedef struct xl_xact_commit /* xl_xact_relfilelocators follows if XINFO_HAS_RELFILELOCATORS */ /* xl_xact_stats_items follows if XINFO_HAS_DROPPED_STATS */ /* xl_xact_invals follows if XINFO_HAS_INVALS */ - /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE (xid is int-aligned!) */ /* twophase_gid follows if XINFO_HAS_GID. As a null-terminated string. */ /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */ } xl_xact_commit; @@ -330,7 +335,7 @@ typedef struct xl_xact_abort /* xl_xact_relfilelocators follows if XINFO_HAS_RELFILELOCATORS */ /* xl_xact_stats_items follows if XINFO_HAS_DROPPED_STATS */ /* No invalidation messages needed. */ - /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE (xid is int-aligned!) */ /* twophase_gid follows if XINFO_HAS_GID. As a null-terminated string. */ /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! 
*/ } xl_xact_abort; diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 001ff2f521..6694551c71 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -37,6 +37,7 @@ * will be skipped) */ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +#define REGBUF_CONVERTED 0x20 /* buffer had format convertion */ /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index e87f91316a..222c15dc24 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -426,10 +426,6 @@ extern bool DecodeXLogRecord(XLogReaderState *state, #define XLogRecBlockImageApply(decoder, block_id) \ ((decoder)->record->blocks[block_id].apply_image) -#ifndef FRONTEND -extern FullTransactionId XLogRecGetFullXid(XLogReaderState *record); -#endif - extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page); extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len); extern void XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 835151ec92..9eab0f53eb 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -41,18 +41,17 @@ typedef struct XLogRecord { uint32 xl_tot_len; /* total len of entire record */ + pg_crc32c xl_crc; /* CRC for this record */ TransactionId xl_xid; /* xact id */ XLogRecPtr xl_prev; /* ptr to previous record in log */ uint8 xl_info; /* flag bits, see below */ RmgrId xl_rmid; /* resource manager for this record */ - /* 2 bytes of padding here, initialize to zero */ - pg_crc32c xl_crc; /* CRC for this record */ /* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */ } XLogRecord; -#define SizeOfXLogRecord (offsetof(XLogRecord, xl_crc) + sizeof(pg_crc32c)) +#define 
SizeOfXLogRecord (offsetof(XLogRecord, xl_rmid) + sizeof(RmgrId)) /* * The high 4 bits in xl_info may be used freely by rmgr. The diff --git a/src/include/c.h b/src/include/c.h index c8f72e44d8..2aea4dadb3 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -75,6 +75,10 @@ #include #endif +#if HAVE_INTTYPES_H +#include "inttypes.h" +#endif + /* ---------------------------------------------------------------- * Section 1: compiler characteristics @@ -585,19 +589,29 @@ typedef double float8; typedef Oid regproc; typedef regproc RegProcedure; -typedef uint32 TransactionId; +typedef uint64 TransactionId; -typedef uint32 LocalTransactionId; +extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2); +extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2); +extern bool TransactionIdFollows(TransactionId id1, TransactionId id2); +extern bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2); -typedef uint32 SubTransactionId; +typedef uint32 ShortTransactionId; +typedef uint64 LocalTransactionId; +typedef uint64 SubTransactionId; -#define InvalidSubTransactionId ((SubTransactionId) 0) -#define TopSubTransactionId ((SubTransactionId) 1) +#define InvalidSubTransactionId ((SubTransactionId) 0) +#define TopSubTransactionId ((SubTransactionId) 1) /* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */ typedef TransactionId MultiXactId; -typedef uint32 MultiXactOffset; +typedef uint64 MultiXactOffset; + +#define MAX_START_XID UINT64CONST(0x3FFFFFFFFFFFFFFF) /* 2^62 - 1 */ +#define StartTransactionIdIsValid(xid) ((xid) <= MAX_START_XID) +#define StartMultiXactIdIsValid(mxid) ((mxid) <= MAX_START_XID) +#define StartMultiXactOffsetIsValid(mxoff) ((mxoff) <= MAX_START_XID) typedef uint32 CommandId; @@ -771,7 +785,6 @@ typedef NameData *Name; /* we don't currently need wider versions of the other ALIGN macros */ #define MAXALIGN64(LEN) TYPEALIGN64(MAXIMUM_ALIGNOF, (LEN)) - /* 
---------------------------------------------------------------- * Section 6: assertions * ---------------------------------------------------------------- diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index c1af6eaf5f..c89ba99071 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,7 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202209291 +/* XXX: should de changed to actual version on commit */ +#define CATALOG_VERSION_NO 999999999 #endif diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 4cc129bebd..4f20977635 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -403,9 +403,9 @@ amprocrighttype => 'bytea', amprocnum => '2', amproc => 'hashvarlenaextended' }, { amprocfamily => 'hash/xid_ops', amproclefttype => 'xid', - amprocrighttype => 'xid', amprocnum => '1', amproc => 'hashint4' }, + amprocrighttype => 'xid', amprocnum => '1', amproc => 'hashint8' }, { amprocfamily => 'hash/xid_ops', amproclefttype => 'xid', - amprocrighttype => 'xid', amprocnum => '2', amproc => 'hashint4extended' }, + amprocrighttype => 'xid', amprocnum => '2', amproc => 'hashint8extended' }, { amprocfamily => 'hash/xid8_ops', amproclefttype => 'xid8', amprocrighttype => 'xid8', amprocnum => '1', amproc => 'hashint8' }, { amprocfamily => 'hash/xid8_ops', amproclefttype => 'xid8', diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 06368e2366..ab02a0896f 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -247,4 +247,10 @@ typedef struct ControlFileData */ #define PG_CONTROL_FILE_SIZE 8192 +#define CONTROLFILE_GET_OLDEDITION(control) \ + ((control)->pg_old_version >> 16) + +#define CONTROLFILE_SET_OLDEDITION(control, v) \ + (control)->pg_old_version = ((v) << 16) + #endif /* PG_CONTROL_H */ diff --git a/src/include/catalog/pg_operator.dat 
b/src/include/catalog/pg_operator.dat index bc5f8213f3..912ab48ecf 100644 --- a/src/include/catalog/pg_operator.dat +++ b/src/include/catalog/pg_operator.dat @@ -183,16 +183,16 @@ oprresult => 'bool', oprcom => '=(xid,xid)', oprnegate => '<>(xid,xid)', oprcode => 'xideq', oprrest => 'eqsel', oprjoin => 'eqjoinsel' }, { oid => '353', descr => 'equal', - oprname => '=', oprleft => 'xid', oprright => 'int4', oprresult => 'bool', - oprnegate => '<>(xid,int4)', oprcode => 'xideqint4', oprrest => 'eqsel', + oprname => '=', oprleft => 'xid', oprright => 'int8', oprresult => 'bool', + oprnegate => '<>(xid,int8)', oprcode => 'xideqint8', oprrest => 'eqsel', oprjoin => 'eqjoinsel' }, { oid => '3315', descr => 'not equal', oprname => '<>', oprleft => 'xid', oprright => 'xid', oprresult => 'bool', oprcom => '<>(xid,xid)', oprnegate => '=(xid,xid)', oprcode => 'xidneq', oprrest => 'neqsel', oprjoin => 'neqjoinsel' }, { oid => '3316', descr => 'not equal', - oprname => '<>', oprleft => 'xid', oprright => 'int4', oprresult => 'bool', - oprnegate => '=(xid,int4)', oprcode => 'xidneqint4', oprrest => 'neqsel', + oprname => '<>', oprleft => 'xid', oprright => 'int8', oprresult => 'bool', + oprnegate => '=(xid,int8)', oprcode => 'xidneqint8', oprrest => 'neqsel', oprjoin => 'neqjoinsel' }, { oid => '5068', descr => 'equal', oprname => '=', oprcanmerge => 't', oprcanhash => 't', oprleft => 'xid8', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 68bb032d3e..58e6ceb80a 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -2369,10 +2369,10 @@ { oid => '1181', descr => 'age of a transaction ID, in transactions before current transaction', proname => 'age', provolatile => 's', proparallel => 'r', - prorettype => 'int4', proargtypes => 'xid', prosrc => 'xid_age' }, + prorettype => 'int8', proargtypes => 'xid', prosrc => 'xid_age' }, { oid => '3939', descr => 'age of a multi-transaction ID, in multi-transactions before 
current multi-transaction', - proname => 'mxid_age', provolatile => 's', prorettype => 'int4', + proname => 'mxid_age', provolatile => 's', prorettype => 'int8', proargtypes => 'xid', prosrc => 'mxid_age' }, { oid => '1188', @@ -2707,11 +2707,11 @@ prosrc => 'bpcharlen' }, { oid => '1319', - proname => 'xideqint4', proleakproof => 't', prorettype => 'bool', - proargtypes => 'xid int4', prosrc => 'xideq' }, + proname => 'xideqint8', proleakproof => 't', prorettype => 'bool', + proargtypes => 'xid int8', prosrc => 'xideq' }, { oid => '3309', - proname => 'xidneqint4', proleakproof => 't', prorettype => 'bool', - proargtypes => 'xid int4', prosrc => 'xidneq' }, + proname => 'xidneqint8', proleakproof => 't', prorettype => 'bool', + proargtypes => 'xid int8', prosrc => 'xidneq' }, { oid => '1326', proname => 'interval_div', prorettype => 'interval', diff --git a/src/include/catalog/pg_type.dat b/src/include/catalog/pg_type.dat index df45879463..9ecd608aa9 100644 --- a/src/include/catalog/pg_type.dat +++ b/src/include/catalog/pg_type.dat @@ -95,9 +95,9 @@ typinput => 'tidin', typoutput => 'tidout', typreceive => 'tidrecv', typsend => 'tidsend', typalign => 's' }, { oid => '28', array_type_oid => '1011', descr => 'transaction id', - typname => 'xid', typlen => '4', typbyval => 't', typcategory => 'U', + typname => 'xid', typlen => '8', typbyval => 'FLOAT8PASSBYVAL', typcategory => 'U', typinput => 'xidin', typoutput => 'xidout', typreceive => 'xidrecv', - typsend => 'xidsend', typalign => 'i' }, + typsend => 'xidsend', typalign => 'x' }, { oid => '29', array_type_oid => '1012', descr => 'command identifier type, sequence in transaction id', typname => 'cid', typlen => '4', typbyval => 't', typcategory => 'U', diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index 48a2559137..71f5f547f4 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -300,6 +300,11 @@ DECLARE_UNIQUE_INDEX(pg_type_typname_nsp_index, 2704, 
TypeNameNspIndexId, on pg_ #define TYPALIGN_SHORT 's' /* short alignment (typically 2 bytes) */ #define TYPALIGN_INT 'i' /* int alignment (typically 4 bytes) */ #define TYPALIGN_DOUBLE 'd' /* double alignment (often 8 bytes) */ +/* + * We need to use alignment sutable for 8-byte XID values. + * On system like AIX double alignment (4 bytes) is not enough. + */ +#define TYPALIGN_XID 'x' #define TYPSTORAGE_PLAIN 'p' /* type not prepared for toasting */ #define TYPSTORAGE_EXTERNAL 'e' /* toastable, don't try to compress */ diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 5d816ba7f4..7255b2eeda 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -213,12 +213,12 @@ typedef enum VacOptValue */ typedef struct VacuumParams { - bits32 options; /* bitmask of VACOPT_* */ - int freeze_min_age; /* min freeze age, -1 to use default */ - int freeze_table_age; /* age at which to scan whole table */ - int multixact_freeze_min_age; /* min multixact freeze age, -1 to + bits32 options; /* bitmask of VacuumOption */ + int64 freeze_min_age; /* min freeze age, -1 to use default */ + int64 freeze_table_age; /* age at which to scan whole table */ + int64 multixact_freeze_min_age; /* min multixact freeze age, -1 to * use default */ - int multixact_freeze_table_age; /* multixact age at which to scan + int64 multixact_freeze_table_age; /* multixact age at which to scan * whole table */ bool is_wraparound; /* force a for-wraparound vacuum */ int log_min_duration; /* minimum execution threshold in ms at @@ -252,12 +252,12 @@ typedef struct VacDeadItems /* GUC parameters */ extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ -extern PGDLLIMPORT int vacuum_freeze_min_age; -extern PGDLLIMPORT int vacuum_freeze_table_age; -extern PGDLLIMPORT int vacuum_multixact_freeze_min_age; -extern PGDLLIMPORT int vacuum_multixact_freeze_table_age; -extern PGDLLIMPORT int vacuum_failsafe_age; -extern PGDLLIMPORT int 
vacuum_multixact_failsafe_age; +extern PGDLLIMPORT int64 vacuum_freeze_min_age; +extern PGDLLIMPORT int64 vacuum_freeze_table_age; +extern PGDLLIMPORT int64 vacuum_multixact_freeze_min_age; +extern PGDLLIMPORT int64 vacuum_multixact_freeze_table_age; +extern PGDLLIMPORT int64 vacuum_failsafe_age; +extern PGDLLIMPORT int64 vacuum_multixact_failsafe_age; /* Variables for cost-based parallel vacuum */ extern PGDLLIMPORT pg_atomic_uint32 *VacuumSharedCostBalance; @@ -287,10 +287,10 @@ extern void vac_update_relstats(Relation relation, bool *minmulti_updated, bool in_outer_xact); extern bool vacuum_set_xid_limits(Relation rel, - int freeze_min_age, - int multixact_freeze_min_age, - int freeze_table_age, - int multixact_freeze_table_age, + int64 freeze_min_age, + int64 multixact_freeze_min_age, + int64 freeze_table_age, + int64 multixact_freeze_table_age, TransactionId *oldestXmin, MultiXactId *oldestMxact, TransactionId *freezeLimit, diff --git a/src/include/fmgr.h b/src/include/fmgr.h index 380a82b9de..3711cac3da 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -281,6 +281,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum); #define PG_GETARG_FLOAT4(n) DatumGetFloat4(PG_GETARG_DATUM(n)) #define PG_GETARG_FLOAT8(n) DatumGetFloat8(PG_GETARG_DATUM(n)) #define PG_GETARG_INT64(n) DatumGetInt64(PG_GETARG_DATUM(n)) +#define PG_GETARG_TRANSACTIONID(n) DatumGetTransactionId(PG_GETARG_DATUM(n)) /* use this if you want the raw, possibly-toasted input datum: */ #define PG_GETARG_RAW_VARLENA_P(n) ((struct varlena *) PG_GETARG_POINTER(n)) /* use this if you want the input datum de-toasted: */ @@ -367,6 +368,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum); #define PG_RETURN_FLOAT8(x) return Float8GetDatum(x) #define PG_RETURN_INT64(x) return Int64GetDatum(x) #define PG_RETURN_UINT64(x) return UInt64GetDatum(x) +#define PG_RETURN_TRANSACTIONID(x) return TransactionIdGetDatum(x) /* RETURN macros for other pass-by-ref types 
will typically look like this: */ #define PG_RETURN_BYTEA_P(x) PG_RETURN_POINTER(x) #define PG_RETURN_TEXT_P(x) PG_RETURN_POINTER(x) diff --git a/src/include/nodes/pg_list.h b/src/include/nodes/pg_list.h index dc991626ba..78d4d08c4a 100644 --- a/src/include/nodes/pg_list.h +++ b/src/include/nodes/pg_list.h @@ -44,6 +44,7 @@ typedef union ListCell { void *ptr_value; int int_value; + int64 int64_value; Oid oid_value; TransactionId xid_value; } ListCell; @@ -169,6 +170,7 @@ list_length(const List *l) */ #define lfirst(lc) ((lc)->ptr_value) #define lfirst_int(lc) ((lc)->int_value) +#define lfirst_int64(lc) ((lc)->int64_value) #define lfirst_oid(lc) ((lc)->oid_value) #define lfirst_xid(lc) ((lc)->xid_value) #define lfirst_node(type,lc) castNode(type, lfirst(lc)) @@ -195,6 +197,7 @@ list_length(const List *l) #define llast(l) lfirst(list_last_cell(l)) #define llast_int(l) lfirst_int(list_last_cell(l)) +#define llast_int64(l) lfirst_int64(list_last_cell(l)) #define llast_oid(l) lfirst_oid(list_last_cell(l)) #define llast_xid(l) lfirst_xid(list_last_cell(l)) #define llast_node(type,l) castNode(type, llast(l)) @@ -557,6 +560,7 @@ extern List *list_make5_impl(NodeTag t, ListCell datum1, ListCell datum2, extern pg_nodiscard List *lappend(List *list, void *datum); extern pg_nodiscard List *lappend_int(List *list, int datum); +extern pg_nodiscard List *lappend_int64(List *list, int64 datum); extern pg_nodiscard List *lappend_oid(List *list, Oid datum); extern pg_nodiscard List *lappend_xid(List *list, TransactionId datum); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c5a80b829e..c1b49b018d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -775,6 +775,9 @@ # endif #endif +/* Postgres Pro use 64bit xids */ +#undef XID_IS_64BIT + /* Size of a WAL file block. This need have no particular relation to BLCKSZ. 
XLOG_BLCKSZ must be a power of 2, and if your system supports O_DIRECT I/O, XLOG_BLCKSZ must be a multiple of the alignment requirement for direct-I/O diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h index 0625cac6b5..40e0b1bd78 100644 --- a/src/include/port/pg_lfind.h +++ b/src/include/port/pg_lfind.h @@ -81,35 +81,21 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem) } /* - * pg_lfind32 + * pg_lfind64 * * Return true if there is an element in 'base' that equals 'key', otherwise * return false. */ static inline bool -pg_lfind32(uint32 key, uint32 *base, uint32 nelem) +pg_lfind64(uint64 key, uint64 *base, uint32 nelem) { - uint32 i = 0; - -#ifndef USE_NO_SIMD - - /* - * For better instruction-level parallelism, each loop iteration operates - * on a block of four registers. Testing for SSE2 has showed this is ~40% - * faster than using a block of two registers. - */ - const Vector32 keys = vector32_broadcast(key); /* load copies of key */ - const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32); - const uint32 nelem_per_iteration = 4 * nelem_per_vector; - - /* round down to multiple of elements per iteration */ - const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1); - + uint32 i, + iterations; #if defined(USE_ASSERT_CHECKING) bool assert_result = false; /* pre-compute the result for assert checking */ - for (i = 0; i < nelem; i++) + for (i = 0; i < nelem; ++i) { if (key == base[i]) { @@ -119,62 +105,127 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem) } #endif - for (i = 0; i < tail_idx; i += nelem_per_iteration) +#define UNROLL_FACTOR 8 + StaticAssertStmt((UNROLL_FACTOR & (UNROLL_FACTOR - 1)) == 0, + "Loop unroll factor must be power of 2"); + iterations = nelem & ~(UNROLL_FACTOR - 1); + for (i = 0; i < iterations; i += UNROLL_FACTOR) { - Vector32 vals1, - vals2, - vals3, - vals4, - result1, - result2, - result3, - result4, - tmp1, - tmp2, - result; - - /* load the next block into 4 registers */ - vector32_load(&vals1, 
&base[i]); - vector32_load(&vals2, &base[i + nelem_per_vector]); - vector32_load(&vals3, &base[i + nelem_per_vector * 2]); - vector32_load(&vals4, &base[i + nelem_per_vector * 3]); - - /* compare each value to the key */ - result1 = vector32_eq(keys, vals1); - result2 = vector32_eq(keys, vals2); - result3 = vector32_eq(keys, vals3); - result4 = vector32_eq(keys, vals4); - - /* combine the results into a single variable */ - tmp1 = vector32_or(result1, result2); - tmp2 = vector32_or(result3, result4); - result = vector32_or(tmp1, tmp2); - - /* see if there was a match */ - if (vector32_is_highbit_set(result)) + if (base[0] == key || base[1] == key || base[2] == key || + base[3] == key || base[4] == key || base[5] == key || + base[6] == key || base[7] == key) { +#if defined(USE_ASSERT_CHECKING) Assert(assert_result == true); +#endif return true; } + base += UNROLL_FACTOR; } -#endif /* ! USE_NO_SIMD */ /* Process the remaining elements one at a time. */ - for (; i < nelem; i++) + iterations = nelem & (UNROLL_FACTOR - 1); + for (i = 0; i < iterations; ++i) { - if (key == base[i]) + if (key == *base++) { -#ifndef USE_NO_SIMD +#if defined(USE_ASSERT_CHECKING) Assert(assert_result == true); #endif return true; } } -#ifndef USE_NO_SIMD +#if defined(USE_ASSERT_CHECKING) Assert(assert_result == false); #endif return false; +// uint32 i = 0; +// +//#ifndef USE_NO_SIMD +// +// /* +// * For better instruction-level parallelism, each loop iteration operates +// * on a block of four registers. Testing for SSE2 has showed this is ~40% +// * faster than using a block of two registers. 
+// */ +// const Vector32 keys = vector32_broadcast(key); /* load copies of key */ +// const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32); +// const uint32 nelem_per_iteration = 4 * nelem_per_vector; +// +// /* round down to multiple of elements per iteration */ +// const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1); +// +//#if defined(USE_ASSERT_CHECKING) +// bool assert_result = false; +// +// /* pre-compute the result for assert checking */ +// for (i = 0; i < nelem; i++) +// { +// if (key == base[i]) +// { +// assert_result = true; +// break; +// } +// } +//#endif +// +// for (i = 0; i < tail_idx; i += nelem_per_iteration) +// { +// Vector32 vals1, +// vals2, +// vals3, +// vals4, +// result1, +// result2, +// result3, +// result4, +// tmp1, +// tmp2, +// result; +// +// /* load the next block into 4 registers */ +// vector32_load(&vals1, &base[i]); +// vector32_load(&vals2, &base[i + nelem_per_vector]); +// vector32_load(&vals3, &base[i + nelem_per_vector * 2]); +// vector32_load(&vals4, &base[i + nelem_per_vector * 3]); +// +// /* compare each value to the key */ +// result1 = vector32_eq(keys, vals1); +// result2 = vector32_eq(keys, vals2); +// result3 = vector32_eq(keys, vals3); +// result4 = vector32_eq(keys, vals4); +// +// /* combine the results into a single variable */ +// tmp1 = vector32_or(result1, result2); +// tmp2 = vector32_or(result3, result4); +// result = vector32_or(tmp1, tmp2); +// +// /* see if there was a match */ +// if (vector32_is_highbit_set(result)) +// { +// Assert(assert_result == true); +// return true; +// } +// } +//#endif /* ! USE_NO_SIMD */ +// +// /* Process the remaining elements one at a time. 
*/ +// for (; i < nelem; i++) +// { +// if (key == base[i]) +// { +//#ifndef USE_NO_SIMD +// Assert(assert_result == true); +//#endif +// return true; +// } +// } +// +//#ifndef USE_NO_SIMD +// Assert(assert_result == false); +//#endif +// return false; } #endif /* PG_LFIND_H */ diff --git a/src/include/postgres.h b/src/include/postgres.h index 5f6a1e3d5a..cf46515829 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -428,6 +428,9 @@ typedef struct NullableDatum #define SIZEOF_DATUM SIZEOF_VOID_P +static uint64 DatumGetUInt64(Datum X); +static Datum UInt64GetDatum(uint64 X); + /* * DatumGetBool * Returns boolean value of a datum. @@ -609,7 +612,7 @@ ObjectIdGetDatum(Oid X) static inline TransactionId DatumGetTransactionId(Datum X) { - return (TransactionId) X; + return DatumGetUInt64(X); } /* @@ -619,7 +622,7 @@ DatumGetTransactionId(Datum X) static inline Datum TransactionIdGetDatum(TransactionId X) { - return (Datum) X; + return UInt64GetDatum(X); } /* @@ -629,7 +632,7 @@ TransactionIdGetDatum(TransactionId X) static inline Datum MultiXactIdGetDatum(MultiXactId X) { - return (Datum) X; + return UInt64GetDatum(X); } /* diff --git a/src/include/postmaster/autovacuum.h b/src/include/postmaster/autovacuum.h index 9d40fd6d54..03024361ea 100644 --- a/src/include/postmaster/autovacuum.h +++ b/src/include/postmaster/autovacuum.h @@ -37,8 +37,8 @@ extern PGDLLIMPORT int autovacuum_vac_ins_thresh; extern PGDLLIMPORT double autovacuum_vac_ins_scale; extern PGDLLIMPORT int autovacuum_anl_thresh; extern PGDLLIMPORT double autovacuum_anl_scale; -extern PGDLLIMPORT int autovacuum_freeze_max_age; -extern PGDLLIMPORT int autovacuum_multixact_freeze_max_age; +extern PGDLLIMPORT int64 autovacuum_freeze_max_age; +extern PGDLLIMPORT int64 autovacuum_multixact_freeze_max_age; extern PGDLLIMPORT double autovacuum_vac_cost_delay; extern PGDLLIMPORT int autovacuum_vac_cost_limit; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 
406db6be78..df62ffa605 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -40,10 +40,10 @@ */ #define BUF_REFCOUNT_ONE 1 #define BUF_REFCOUNT_MASK ((1U << 18) - 1) -#define BUF_USAGECOUNT_MASK 0x003C0000U +#define BUF_USAGECOUNT_MASK 0x001C0000U #define BUF_USAGECOUNT_ONE (1U << 18) #define BUF_USAGECOUNT_SHIFT 18 -#define BUF_FLAG_MASK 0xFFC00000U +#define BUF_FLAG_MASK 0xFFE00000U /* Get refcount and usagecount from buffer state */ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK) @@ -55,6 +55,7 @@ * Note: BM_TAG_VALID essentially means that there is a buffer hashtable * entry associated with the buffer's tag. */ +#define BM_CONVERTED (1U << 21) /* buffer were converted to 64xid */ #define BM_LOCKED (1U << 22) /* buffer header is locked */ #define BM_DIRTY (1U << 23) /* data needs writing */ #define BM_VALID (1U << 24) /* data is valid */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 6f4dfa0960..fc9d1af846 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -157,8 +157,12 @@ extern void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum); extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); +extern void MarkBufferConverted(Buffer buffer, bool converted); +extern bool IsBufferConverted(Buffer buffer); extern void UnlockBuffers(void); +extern bool IsBufferLocked(Buffer buffer); +extern bool IsBufferLockedExclusive(Buffer buffer); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); extern void LockBufferForCleanup(Buffer buffer); @@ -184,6 +188,8 @@ extern void AtProcExit_LocalBuffers(void); extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); extern void FreeAccessStrategy(BufferAccessStrategy strategy); +/* old tuple format support */ +extern void convert_page(Relation rel, Page orig_page, Buffer buf, BlockNumber 
blkno); /* inline functions */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 2708c4b683..87375bbf79 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -14,10 +14,13 @@ #ifndef BUFPAGE_H #define BUFPAGE_H +#include "access/transam.h" #include "access/xlogdefs.h" #include "storage/block.h" #include "storage/item.h" #include "storage/off.h" +#include "postgres.h" +#include "utils/rel.h" /* * A postgres disk page is an abstraction layered on top of a postgres @@ -163,12 +166,41 @@ typedef struct PageHeaderData LocationIndex pd_upper; /* offset to end of free space */ LocationIndex pd_special; /* offset to start of special space */ uint16 pd_pagesize_version; - TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ + ShortTransactionId pd_prune_xid; /* oldest prunable XID, or zero if + * none */ ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */ } PageHeaderData; typedef PageHeaderData *PageHeader; + +/* + * HeapPageSpecialData -- data that stored at the end of each heap page. + * + * pd_xid_base - base value for transaction IDs on page + * pd_multi_base - base value for multixact IDs on page + * + * pd_xid_base and pd_multi_base are base values for calculation of transaction + * identifiers from t_xmin and t_xmax in each heap tuple header on the page. + */ +typedef struct HeapPageSpecialData +{ + TransactionId pd_xid_base; /* base value for transaction IDs on page */ + TransactionId pd_multi_base; /* base value for multixact IDs on page */ +} HeapPageSpecialData; + +typedef HeapPageSpecialData *HeapPageSpecial; + +typedef struct ToastPageSpecialData +{ + TransactionId pd_xid_base; /* base value for transaction IDs on page */ +} ToastPageSpecialData; + +typedef ToastPageSpecialData *ToastPageSpecial; + +extern PGDLLIMPORT HeapPageSpecial heapDoubleXmaxSpecial; +extern PGDLLIMPORT ToastPageSpecial toastDoubleXmaxSpecial; + /* * pd_flags contains the following flag bits. 
Undefined bits are initialized * to zero and may be used in the future. @@ -200,7 +232,7 @@ typedef PageHeaderData *PageHeader; * As of Release 9.3, the checksum version must also be considered when * handling pages. */ -#define PG_PAGE_LAYOUT_VERSION 4 +#define PG_PAGE_LAYOUT_VERSION 5 #define PG_DATA_CHECKSUM_VERSION 1 /* ---------------------------------------------------------------- @@ -440,18 +472,177 @@ PageClearAllVisible(Page page) } /* - * These two require "access/transam.h", so left as macros. + * Check if page is in "double xmax" format. */ -#define PageSetPrunable(page, xid) \ -do { \ - Assert(TransactionIdIsNormal(xid)); \ - if (!TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) || \ - TransactionIdPrecedes(xid, ((PageHeader) (page))->pd_prune_xid)) \ - ((PageHeader) (page))->pd_prune_xid = (xid); \ -} while (0) -#define PageClearPrunable(page) \ - (((PageHeader) (page))->pd_prune_xid = InvalidTransactionId) +static inline bool +HeapPageIsDoubleXmax(Page page) +{ + return ((PageHeader) (page))->pd_special == BLCKSZ; +} +/* + * Get pointer to HeapPageSpecialData. + * + * Can be used for non-consistent reads from non-locked pages. + * + * Return doubleXmaxSpecial when pd_special == BLCKSZ (i.e. "double xmax" + * format). + */ +static inline HeapPageSpecial +HeapPageGetSpecialNoAssert(Page page) +{ + if (HeapPageIsDoubleXmax(page)) + return heapDoubleXmaxSpecial; + + return (HeapPageSpecial) ((char *) page + + ((PageHeader) page)->pd_special); +} + +/* + * Get pointer to ToastPageSpecialData. + * + * Can be used for non-consistent reads from non-locked pages. + * + * Return doubleXmaxSpecial when pd_special == BLCKSZ (i.e. "double xmax" + * format). + */ +static inline ToastPageSpecial +ToastPageGetSpecialNoAssert(Page page) +{ + if (HeapPageIsDoubleXmax(page)) + return toastDoubleXmaxSpecial; + + return (ToastPageSpecial) ((char *) page + + ((PageHeader) page)->pd_special); +} + +/* + * Wrapper for HeapPageGetSpecialNoAssert for general use. 
+ */ +static inline HeapPageSpecial +HeapPageGetSpecial(Page page) +{ + if (HeapPageIsDoubleXmax(page)) + return heapDoubleXmaxSpecial; + + Assert(((PageHeader) page)->pd_special == + BLCKSZ - MAXALIGN(sizeof(HeapPageSpecialData))); + + return (HeapPageSpecial) ((char *) page + + ((PageHeader) page)->pd_special); +} + +/* + * Wrapper for ToastPageGetSpecialNoAssert for general use. + */ +static inline ToastPageSpecial +ToastPageGetSpecial(Page page) +{ + if (HeapPageIsDoubleXmax(page)) + return toastDoubleXmaxSpecial; + + Assert(((PageHeader) page)->pd_special == + BLCKSZ - MAXALIGN(sizeof(ToastPageSpecialData))); + + return (ToastPageSpecial) ((char *) page + + ((PageHeader) page)->pd_special); +} + +/* + * Set pd_prune_xid. + */ +static inline void +HeapPageSetPruneXid(Page page, TransactionId xid, bool is_toast) +{ + TransactionId base; + + if (HeapPageIsDoubleXmax(page)) + return; + + if (!TransactionIdIsNormal(xid)) + { + ((PageHeader) (page))->pd_prune_xid = xid; + return; + } + + base = is_toast ? ToastPageGetSpecial(page)->pd_xid_base : + HeapPageGetSpecial(page)->pd_xid_base; + + ((PageHeader) (page))->pd_prune_xid = NormalTransactionIdToShort(base, xid); + Assert(((PageHeader) (page))->pd_prune_xid <= MaxShortTransactionId); +} + +static inline void +ToastPageSetPruneXid(Page page, TransactionId xid) +{ + if (HeapPageIsDoubleXmax(page)) + return; + + if (!TransactionIdIsNormal(xid)) + { + ((PageHeader) (page))->pd_prune_xid = xid; + return; + } + + ((PageHeader) (page))->pd_prune_xid = + NormalTransactionIdToShort(ToastPageGetSpecial(page)->pd_xid_base, (xid)); + + Assert(((PageHeader) (page))->pd_prune_xid <= MaxShortTransactionId); +} + +/* + * Get pd_prune_xid from locked page. + */ +static inline TransactionId +HeapPageGetPruneXid(Page page, bool is_toast) +{ + TransactionId base; + + if (HeapPageIsDoubleXmax(page)) + return ((PageHeader) (page))->pd_prune_xid; + + base = is_toast ? 
ToastPageGetSpecial(page)->pd_xid_base : + HeapPageGetSpecial(page)->pd_xid_base; + + return ShortTransactionIdToNormal(base, + ((PageHeader) (page))->pd_prune_xid); +} + +static inline void +PageSetPrunable(Page page, TransactionId xid, bool is_toast) +{ + TransactionId prune_xid; + + Assert(TransactionIdIsNormal(xid)); + + if (HeapPageIsDoubleXmax(page)) + return; + + prune_xid = HeapPageGetPruneXid(page, is_toast); + if ((!TransactionIdIsValid(prune_xid) || + TransactionIdPrecedes(xid, prune_xid))) + { + HeapPageSetPruneXid(page, xid, is_toast); + } +} + +/* + * Get pd_prune_xid from non-locked page. May return invalid value, but doen't + * causes assert failures. + */ +static inline TransactionId +HeapPageGetPruneXidNoAssert(Page page, bool is_toast) +{ + TransactionId base; + + if (HeapPageIsDoubleXmax(page)) + return ((PageHeader) (page))->pd_prune_xid; + + base = is_toast ? ToastPageGetSpecialNoAssert(page)->pd_xid_base : + HeapPageGetSpecialNoAssert(page)->pd_xid_base; + return ShortTransactionIdToNormal(base, + ((PageHeader) (page))->pd_prune_xid); +} /* ---------------------------------------------------------------- * extern declarations @@ -485,6 +676,21 @@ do { \ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)), "BLCKSZ has to be a multiple of sizeof(size_t)"); +/* + * Tuple defrag support for PageRepairFragmentation and PageIndexMultiDelete + */ +typedef struct ItemIdCompactData +{ + uint16 offsetindex; /* linp array index */ + int16 itemoff; /* page offset of item data */ + uint16 alignedlen; /* MAXALIGN(item data len) */ +} ItemIdCompactData; + +typedef ItemIdCompactData *ItemIdCompact; +typedef RelationData *Relation; + +extern int itemoffcompare(const void *item1, const void *item2); + extern void PageInit(Page page, Size pageSize, Size specialSize); extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags); extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size, @@ -493,7 +699,7 @@ extern 
Page PageGetTempPage(Page page); extern Page PageGetTempPageCopy(Page page); extern Page PageGetTempPageCopySpecial(Page page); extern void PageRestoreTempPage(Page tempPage, Page oldPage); -extern void PageRepairFragmentation(Page page); +extern void PageRepairFragmentation(Page page, bool is_toast); extern void PageTruncateLinePointerArray(Page page); extern Size PageGetFreeSpace(Page page); extern Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups); diff --git a/src/include/storage/itemid.h b/src/include/storage/itemid.h index e33637ff21..442a72d658 100644 --- a/src/include/storage/itemid.h +++ b/src/include/storage/itemid.h @@ -78,6 +78,8 @@ typedef uint16 ItemLength; #define ItemIdGetRedirect(itemId) \ ((itemId)->lp_off) +#define ItemIdGetTupleEnd(itemId) \ + (MAXALIGN(ItemIdGetLength((itemId))) + ItemIdGetOffset((itemId))) /* * ItemIdIsValid * True iff item identifier is valid. diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index e4e1495b24..0e0cd79bb0 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -227,8 +227,8 @@ typedef struct LOCKTAG /* ID info for a transaction is its TransactionId */ #define SET_LOCKTAG_TRANSACTION(locktag,xid) \ - ((locktag).locktag_field1 = (xid), \ - (locktag).locktag_field2 = 0, \ + ((locktag).locktag_field1 = (uint32)((xid) & 0xFFFFFFFF), \ + (locktag).locktag_field2 = (uint32)((xid) >> 32), \ (locktag).locktag_field3 = 0, \ (locktag).locktag_field4 = 0, \ (locktag).locktag_type = LOCKTAG_TRANSACTION, \ @@ -237,8 +237,8 @@ typedef struct LOCKTAG /* ID info for a virtual transaction is its VirtualTransactionId */ #define SET_LOCKTAG_VIRTUALTRANSACTION(locktag,vxid) \ ((locktag).locktag_field1 = (vxid).backendId, \ - (locktag).locktag_field2 = (vxid).localTransactionId, \ - (locktag).locktag_field3 = 0, \ + (locktag).locktag_field2 = (uint32)((vxid).localTransactionId & 0xFFFFFFFF), \ + (locktag).locktag_field3 = (uint32)((vxid).localTransactionId >> 32), \ 
(locktag).locktag_field4 = 0, \ (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) @@ -248,9 +248,9 @@ typedef struct LOCKTAG * its speculative insert counter. */ #define SET_LOCKTAG_SPECULATIVE_INSERTION(locktag,xid,token) \ - ((locktag).locktag_field1 = (xid), \ - (locktag).locktag_field2 = (token), \ - (locktag).locktag_field3 = 0, \ + ((locktag).locktag_field1 = (uint32)((xid) & 0xFFFFFFFF), \ + (locktag).locktag_field2 = (uint32)((xid) >> 32), \ + (locktag).locktag_field3 = (token), \ (locktag).locktag_field4 = 0, \ (locktag).locktag_type = LOCKTAG_SPECULATIVE_TOKEN, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index fdc18f8180..4ff7353e8c 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -17,6 +17,7 @@ #include "access/clog.h" #include "access/xlogdefs.h" #include "lib/ilist.h" +#include "port/atomics.h" #include "storage/latch.h" #include "storage/lock.h" #include "storage/pg_sema.h" @@ -176,12 +177,12 @@ struct PGPROC Latch procLatch; /* generic latch for process */ - TransactionId xid; /* id of top-level transaction currently being + pg_atomic_uint64 xid; /* id of top-level transaction currently being * executed by this proc, if running and XID * is assigned; else InvalidTransactionId. * mirrored in ProcGlobal->xids[pgxactoff] */ - TransactionId xmin; /* minimal running XID as it was when we were + pg_atomic_uint64 xmin; /* minimal running XID as it was when we were * starting our xact, excluding LAZY VACUUM: * vacuum must not remove tuples deleted by * xid >= xmin ! 
*/ @@ -364,7 +365,7 @@ typedef struct PROC_HDR PGPROC *allProcs; /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ - TransactionId *xids; + pg_atomic_uint64 *xids; /* * Array mirroring PGPROC.subxidStatus for each PGPROC currently in the diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index f5da98dc73..209615f3cc 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -21,7 +21,7 @@ #include "storage/standbydefs.h" /* User-settable GUC parameters */ -extern PGDLLIMPORT int vacuum_defer_cleanup_age; +extern PGDLLIMPORT int64 vacuum_defer_cleanup_age; extern PGDLLIMPORT int max_standby_archive_delay; extern PGDLLIMPORT int max_standby_streaming_delay; extern PGDLLIMPORT bool log_recovery_conflict_waits; diff --git a/src/include/utils/combocid.h b/src/include/utils/combocid.h index 80fe6d2cea..8465768b6f 100644 --- a/src/include/utils/combocid.h +++ b/src/include/utils/combocid.h @@ -15,7 +15,7 @@ #define COMBOCID_H /* - * HeapTupleHeaderGetCmin and HeapTupleHeaderGetCmax function prototypes + * HeapTupleGetCmin and HeapTupleGetCmax function prototypes * are in access/htup.h, because that's where the macro definitions that * those functions replaced used to be. 
*/ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 7dc401cf0d..3e1e5a5e4a 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -308,12 +308,12 @@ typedef struct AutoVacOpts int vacuum_ins_threshold; int analyze_threshold; int vacuum_cost_limit; - int freeze_min_age; - int freeze_max_age; - int freeze_table_age; - int multixact_freeze_min_age; - int multixact_freeze_max_age; - int multixact_freeze_table_age; + int64 freeze_min_age; + int64 freeze_max_age; + int64 freeze_table_age; + int64 multixact_freeze_min_age; + int64 multixact_freeze_max_age; + int64 multixact_freeze_table_age; int log_min_duration; float8 vacuum_cost_delay; float8 vacuum_scale_factor; diff --git a/src/include/utils/xid8.h b/src/include/utils/xid8.h index 9c5ce241db..1fdd1e86c1 100644 --- a/src/include/utils/xid8.h +++ b/src/include/utils/xid8.h @@ -17,13 +17,13 @@ static inline FullTransactionId DatumGetFullTransactionId(Datum X) { - return FullTransactionIdFromU64(DatumGetUInt64(X)); + return FullTransactionIdFromXid(DatumGetUInt64(X)); } static inline Datum FullTransactionIdGetDatum(FullTransactionId X) { - return UInt64GetDatum(U64FromFullTransactionId(X)); + return UInt64GetDatum(XidFromFullTransactionId(X)); } #define PG_GETARG_FULLTRANSACTIONID(X) DatumGetFullTransactionId(PG_GETARG_DATUM(X)) diff --git a/src/pl/plperl/plperl.c b/src/pl/plperl/plperl.c index 8f21e0d701..6772f8c360 100644 --- a/src/pl/plperl/plperl.c +++ b/src/pl/plperl/plperl.c @@ -2664,7 +2664,7 @@ validate_plperl_function(plperl_proc_ptr *proc_ptr, HeapTuple procTup) * This is needed because CREATE OR REPLACE FUNCTION can modify the * function's pg_proc entry without changing its OID. 
************************************************************/ - uptodate = (prodesc->fn_xmin == HeapTupleHeaderGetRawXmin(procTup->t_data) && + uptodate = (prodesc->fn_xmin == HeapTupleGetRawXmin(procTup) && ItemPointerEquals(&prodesc->fn_tid, &procTup->t_self)); if (uptodate) @@ -2788,7 +2788,7 @@ compile_plperl_function(Oid fn_oid, bool is_trigger, bool is_event_trigger) MemoryContextSetIdentifier(proc_cxt, prodesc->proname); prodesc->fn_cxt = proc_cxt; prodesc->fn_refcount = 0; - prodesc->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); + prodesc->fn_xmin = HeapTupleGetRawXmin(procTup); prodesc->fn_tid = procTup->t_self; prodesc->nargs = procStruct->pronargs; prodesc->arg_out_func = (FmgrInfo *) palloc0(prodesc->nargs * sizeof(FmgrInfo)); diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c index b286f2a50c..61db613c2f 100644 --- a/src/pl/plpgsql/src/pl_comp.c +++ b/src/pl/plpgsql/src/pl_comp.c @@ -171,7 +171,7 @@ recheck: if (function) { /* We have a compiled function, but is it still valid? 
*/ - if (function->fn_xmin == HeapTupleHeaderGetRawXmin(procTup->t_data) && + if (function->fn_xmin == HeapTupleGetRawXmin(procTup) && ItemPointerEquals(&function->fn_tid, &procTup->t_self)) function_valid = true; else @@ -348,7 +348,7 @@ do_compile(FunctionCallInfo fcinfo, function->fn_signature = format_procedure(fcinfo->flinfo->fn_oid); MemoryContextSetIdentifier(func_cxt, function->fn_signature); function->fn_oid = fcinfo->flinfo->fn_oid; - function->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); + function->fn_xmin = HeapTupleGetRawXmin(procTup); function->fn_tid = procTup->t_self; function->fn_input_collation = fcinfo->fncollation; function->fn_cxt = func_cxt; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index a647342948..6763b6a4b7 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -7376,6 +7376,7 @@ deconstruct_composite_datum(Datum value, HeapTupleData *tmptup) tmptup->t_len = HeapTupleHeaderGetDatumLength(td); ItemPointerSetInvalid(&(tmptup->t_self)); tmptup->t_tableOid = InvalidOid; + HeapTupleSetZeroBase(tmptup); tmptup->t_data = td; /* Extract rowtype info and find a tupdesc */ @@ -7550,6 +7551,7 @@ exec_move_row_from_datum(PLpgSQL_execstate *estate, tmptup.t_len = HeapTupleHeaderGetDatumLength(td); ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; + HeapTupleSetZeroBase(&tmptup); tmptup.t_data = td; /* Extract rowtype info */ diff --git a/src/pl/plpython/plpy_procedure.c b/src/pl/plpython/plpy_procedure.c index 494f109b32..9884f74fa7 100644 --- a/src/pl/plpython/plpy_procedure.c +++ b/src/pl/plpython/plpy_procedure.c @@ -178,7 +178,7 @@ PLy_procedure_create(HeapTuple procTup, Oid fn_oid, bool is_trigger) proc->proname = pstrdup(NameStr(procStruct->proname)); MemoryContextSetIdentifier(cxt, proc->proname); proc->pyname = pstrdup(procName); - proc->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); + proc->fn_xmin = HeapTupleGetRawXmin(procTup); proc->fn_tid = 
procTup->t_self; proc->fn_readonly = (procStruct->provolatile != PROVOLATILE_VOLATILE); proc->is_setof = procStruct->proretset; @@ -419,7 +419,7 @@ PLy_procedure_valid(PLyProcedure *proc, HeapTuple procTup) return false; /* If the pg_proc tuple has changed, it's not valid */ - if (!(proc->fn_xmin == HeapTupleHeaderGetRawXmin(procTup->t_data) && + if (!(proc->fn_xmin == HeapTupleGetRawXmin(procTup) && ItemPointerEquals(&proc->fn_tid, &procTup->t_self))) return false; diff --git a/src/pl/tcl/pltcl.c b/src/pl/tcl/pltcl.c index eaa98d42c2..bd5b071c13 100644 --- a/src/pl/tcl/pltcl.c +++ b/src/pl/tcl/pltcl.c @@ -1428,7 +1428,7 @@ compile_pltcl_function(Oid fn_oid, Oid tgreloid, * function's pg_proc entry without changing its OID. ************************************************************/ if (prodesc != NULL && - prodesc->fn_xmin == HeapTupleHeaderGetRawXmin(procTup->t_data) && + prodesc->fn_xmin == HeapTupleGetRawXmin(procTup) && ItemPointerEquals(&prodesc->fn_tid, &procTup->t_self)) { /* It's still up-to-date, so we can use it */ @@ -1495,7 +1495,7 @@ compile_pltcl_function(Oid fn_oid, Oid tgreloid, prodesc->internal_proname = pstrdup(internal_proname); prodesc->fn_cxt = proc_cxt; prodesc->fn_refcount = 0; - prodesc->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); + prodesc->fn_xmin = HeapTupleGetRawXmin(procTup); prodesc->fn_tid = procTup->t_self; prodesc->nargs = procStruct->pronargs; prodesc->arg_out_func = (FmgrInfo *) palloc0(prodesc->nargs * sizeof(FmgrInfo)); diff --git a/src/test/Makefile b/src/test/Makefile index dbd3192874..8e0f39289e 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -12,7 +12,8 @@ subdir = src/test top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = perl regress isolation modules authentication recovery subscription +SUBDIRS = perl regress isolation modules authentication recovery subscription \ + xid-64 ifeq ($(with_icu),yes) SUBDIRS += icu diff --git a/src/test/meson.build b/src/test/meson.build index 241d9d48aa..650936bd66 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -5,6 +5,7 @@ subdir('authentication') subdir('recovery') subdir('subscription') subdir('modules') +subdir('xid-64') if ssl.found() subdir('ssl') diff --git a/src/test/modules/test_lfind/test_lfind.c b/src/test/modules/test_lfind/test_lfind.c index 82673d54c6..e4fd52c134 100644 --- a/src/test/modules/test_lfind/test_lfind.c +++ b/src/test/modules/test_lfind/test_lfind.c @@ -120,29 +120,29 @@ Datum test_lfind32(PG_FUNCTION_ARGS) { #define TEST_ARRAY_SIZE 135 - uint32 test_array[TEST_ARRAY_SIZE] = {0}; + uint64 test_array[TEST_ARRAY_SIZE] = {0}; test_array[8] = 1; test_array[64] = 2; test_array[TEST_ARRAY_SIZE - 1] = 3; - if (pg_lfind32(1, test_array, 4)) - elog(ERROR, "pg_lfind32() found nonexistent element"); - if (!pg_lfind32(1, test_array, TEST_ARRAY_SIZE)) - elog(ERROR, "pg_lfind32() did not find existing element"); + if (pg_lfind64(1, test_array, 4)) + elog(ERROR, "pg_lfind64() found nonexistent element"); + if (!pg_lfind64(1, test_array, TEST_ARRAY_SIZE)) + elog(ERROR, "pg_lfind64() did not find existing element"); - if (pg_lfind32(2, test_array, 32)) - elog(ERROR, "pg_lfind32() found nonexistent element"); - if (!pg_lfind32(2, test_array, TEST_ARRAY_SIZE)) - elog(ERROR, "pg_lfind32() did not find existing element"); + if (pg_lfind64(2, test_array, 32)) + elog(ERROR, "pg_lfind64() found nonexistent element"); + if (!pg_lfind64(2, test_array, TEST_ARRAY_SIZE)) + elog(ERROR, "pg_lfind64() did not find existing element"); - if (pg_lfind32(3, test_array, 96)) - elog(ERROR, "pg_lfind32() found nonexistent element"); - if (!pg_lfind32(3, test_array, TEST_ARRAY_SIZE)) - 
elog(ERROR, "pg_lfind32() did not find existing element"); + if (pg_lfind64(3, test_array, 96)) + elog(ERROR, "pg_lfind64() found nonexistent element"); + if (!pg_lfind64(3, test_array, TEST_ARRAY_SIZE)) + elog(ERROR, "pg_lfind64() did not find existing element"); - if (pg_lfind32(4, test_array, TEST_ARRAY_SIZE)) - elog(ERROR, "pg_lfind32() found nonexistent element"); + if (pg_lfind64(4, test_array, TEST_ARRAY_SIZE)) + elog(ERROR, "pg_lfind64() found nonexistent element"); PG_RETURN_VOID(); } diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 4fef9c12e6..cb535c0906 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -476,7 +476,9 @@ sub init mkdir $self->archive_dir; PostgreSQL::Test::Utils::system_or_bail('initdb', '-D', $pgdata, '-A', - 'trust', '-N', @{ $params{extra} }); + 'trust', '-N', + '-x', '1249835483136', '-m', '2422361554944', '-o', '3594887626752', + @{ $params{extra} }); PostgreSQL::Test::Utils::system_or_bail($ENV{PG_REGRESS}, '--config-auth', $pgdata, @{ $params{auth_extra} }); diff --git a/src/test/recovery/t/003_recovery_targets.pl b/src/test/recovery/t/003_recovery_targets.pl index e8e1a420bc..8329d2ff7e 100644 --- a/src/test/recovery/t/003_recovery_targets.pl +++ b/src/test/recovery/t/003_recovery_targets.pl @@ -57,7 +57,7 @@ $node_primary->init(has_archiving => 1, allows_streaming => 1); # Bump the transaction ID epoch. This is useful to stress the portability # of recovery_target_xid parsing. 
-system_or_bail('pg_resetwal', '--epoch', '1', $node_primary->data_dir); +system_or_bail('pg_resetwal', $node_primary->data_dir); # Start it $node_primary->start; diff --git a/src/test/regress/expected/indirect_toast.out b/src/test/regress/expected/indirect_toast.out index 44b54dc37f..313482b866 100644 --- a/src/test/regress/expected/indirect_toast.out +++ b/src/test/regress/expected/indirect_toast.out @@ -161,6 +161,14 @@ SELECT substring(indtoasttest::text, 1, 200) FROM indtoasttest; ("one-toasted,one-null, via indirect",0,1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890 (5 rows) +create or replace function random_string(len integer) returns text as $$ +select substr((select string_agg(r,'') from (select random()::text as r from generate_series(1,(len+15)/16)) s1), 1, len); +$$ language sql; +create table toasttest_main(t text); +alter table toasttest_main alter column t set storage main; +insert into toasttest_main (select random_string(len) from generate_series(7000,8000) len); DROP TABLE indtoasttest; +DROP TABLE toasttest_main; DROP FUNCTION update_using_indirect(); +DROP FUNCTION random_string(integer); RESET default_toast_compression; diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index dd4354fc7d..d52545b443 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -100,7 +100,7 @@ SELECT pg_size_pretty(pg_relation_size('large_tuple_test'::regclass, 'main')); INSERT INTO large_tuple_test (select 3, NULL); -- now this tuple won't fit on the second page, but the insert should -- still succeed by extending the relation -INSERT INTO large_tuple_test (select 4, repeat('a', 8126)); +INSERT INTO large_tuple_test (select 4, repeat('a', 8112)); DROP TABLE large_tuple_test; -- -- check indirection (field/array assignment), cf bug #14265 @@ -980,3 +980,17 @@ insert into 
returningwrtest values (2, 'foo') returning returningwrtest; (1 row) drop table returningwrtest; +-- Check for MaxHeapTupleSize +create table maxheaptuplesize_test(value text); +alter table maxheaptuplesize_test alter column value set storage external; +insert into maxheaptuplesize_test values (repeat('x', 8104)); +insert into maxheaptuplesize_test values (repeat('x', 8112)); +insert into maxheaptuplesize_test values (repeat('x', 8120)); +insert into maxheaptuplesize_test values (repeat('x', 8128)); +insert into maxheaptuplesize_test values (repeat('x', 8136)); +insert into maxheaptuplesize_test values (repeat('x', 8144)); +insert into maxheaptuplesize_test values (repeat('x', 8152)); +insert into maxheaptuplesize_test values (repeat('x', 8160)); +insert into maxheaptuplesize_test values (repeat('x', 8168)); +insert into maxheaptuplesize_test values (repeat('x', 8176)); +drop table maxheaptuplesize_test; diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out index 330eb0f765..ce4a2ab432 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -197,7 +197,7 @@ WHERE p1.oid != p2.oid AND ORDER BY 1, 2; proargtypes | proargtypes -----------------------------+-------------------------- - integer | xid + bigint | xid timestamp without time zone | timestamp with time zone bit | bit varying txid_snapshot | pg_snapshot @@ -705,7 +705,7 @@ int8(oid) tideq(tid,tid) timestamptz_cmp(timestamp with time zone,timestamp with time zone) interval_cmp(interval,interval) -xideqint4(xid,integer) +xideqint8(xid,bigint) timetz_eq(time with time zone,time with time zone) timetz_ne(time with time zone,time with time zone) timetz_lt(time with time zone,time with time zone) @@ -819,7 +819,7 @@ pg_lsn_gt(pg_lsn,pg_lsn) pg_lsn_ne(pg_lsn,pg_lsn) pg_lsn_cmp(pg_lsn,pg_lsn) xidneq(xid,xid) -xidneqint4(xid,integer) +xidneqint8(xid,bigint) sha224(bytea) sha256(bytea) sha384(bytea) diff --git 
a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index 1aeed8452b..d3be84754c 100644 --- a/src/test/regress/expected/select_views.out +++ b/src/test/regress/expected/select_views.out @@ -2,9 +2,22 @@ -- SELECT_VIEWS -- test the views defined in CREATE_VIEWS -- -SELECT * FROM street; +SELECT * FROM street ORDER BY name COLLATE "C", thepath::text COLLATE "C"; name | thepath | cname ------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------- + 100th Ave | [(-122.1657,37.429),(-122.1647,37.432)] | Oakland + 107th Ave | [(-122.1555,37.403),(-122.1531,37.41)] | Oakland + 14th St | [(-122.299,37.147),(-122.3,37.148)] | Lafayette + 19th Ave | [(-122.2366,37.897),(-122.2359,37.905)] | Berkeley + 1st St | [(-121.75508,37.89294),(-121.753581,37.90031)] | Oakland + 5th St | [(-122.278,37),(-122.2792,37.005),(-122.2803,37.009)] | Lafayette + 5th St | [(-122.296,37.615),(-122.2953,37.598)] | Berkeley + 82nd Ave | [(-122.1695,37.596),(-122.1681,37.603)] | Berkeley + 85th Ave | [(-122.1877,37.466),(-122.186,37.476)] | Oakland + 89th Ave | [(-122.1822,37.459),(-122.1803,37.471)] | Oakland + 98th Ave | [(-122.1568,37.498),(-122.1558,37.502)] | Oakland + 98th Ave | [(-122.1693,37.438),(-122.1682,37.444)] | Oakland + 98th Ave | [(-122.2001,37.258),(-122.1974,37.27)] | Lafayette Access Rd 25 | [(-121.9283,37.894),(-121.9283,37.9)] | Oakland Ada St | [(-122.2487,37.398),(-122.2496,37.401)] | Lafayette Agua Fria Creek | [(-121.9254,37.922),(-121.9281,37.889)] | Oakland @@ -22,8 +35,8 @@ SELECT * FROM street; Arroyo Las Positas | 
[(-121.7973,37.997),(-121.7957,37.005)] | Oakland Arroyo Seco | [(-121.7073,37.766),(-121.6997,37.729)] | Oakland Ash St | [(-122.0408,37.31),(-122.04,37.292)] | Oakland - Avenue 134th | [(-122.1823,37.002),(-122.1851,37.992)] | Oakland Avenue 134th | [(-122.1823,37.002),(-122.1851,37.992)] | Berkeley + Avenue 134th | [(-122.1823,37.002),(-122.1851,37.992)] | Oakland Avenue 140th | [(-122.1656,37.003),(-122.1691,37.988)] | Oakland Avenue 140th | [(-122.1656,37.003),(-122.1691,37.988)] | Berkeley Avenue D | [(-122.298,37.848),(-122.3024,37.849)] | Berkeley @@ -37,14 +50,14 @@ SELECT * FROM street; Broadmore Ave | [(-122.095,37.522),(-122.0936,37.497)] | Oakland Broadway | [(-122.2409,37.586),(-122.2395,37.601)] | Berkeley Buckingham Blvd | [(-122.2231,37.59),(-122.2214,37.606)] | Berkeley + Butterfield Dr | [(-122.0838,37.002),(-122.0834,37.987)] | Berkeley Butterfield Dr | [(-122.0838,37.002),(-122.0834,37.987)] | Oakland Butterfield Dr | [(-122.0838,37.002),(-122.0834,37.987)] | Oakland - Butterfield Dr | [(-122.0838,37.002),(-122.0834,37.987)] | Berkeley C St | [(-122.1768,37.46),(-122.1749,37.435)] | Oakland Calaveras Creek | [(-121.8203,37.035),(-121.8207,37.931)] | Oakland Calaveras Creek | [(-121.8203,37.035),(-121.8207,37.931)] | Oakland - California St | [(-122.2032,37.005),(-122.2016,37.996)] | Berkeley California St | [(-122.2032,37.005),(-122.2016,37.996)] | Lafayette + California St | [(-122.2032,37.005),(-122.2016,37.996)] | Berkeley Cameron Ave | [(-122.1316,37.502),(-122.1327,37.481)] | Oakland Campus Dr | [(-122.1704,37.905),(-122.1678,37.868),(-122.1671,37.865)] | Berkeley Capricorn Ave | [(-122.2176,37.404),(-122.2164,37.384)] | Lafayette @@ -55,8 +68,8 @@ SELECT * FROM street; Central Ave | [(-122.2343,37.602),(-122.2331,37.595)] | Berkeley Chambers Dr | [(-122.2004,37.352),(-122.1972,37.368)] | Lafayette Chambers Lane | [(-122.2001,37.359),(-122.1975,37.371)] | Lafayette - Champion St | [(-122.214,37.991),(-122.2147,37.002)] | Berkeley Champion 
St | [(-122.214,37.991),(-122.2147,37.002)] | Lafayette + Champion St | [(-122.214,37.991),(-122.2147,37.002)] | Berkeley Chapman Dr | [(-122.0421,37.504),(-122.0414,37.498)] | Oakland Charles St | [(-122.0255,37.505),(-122.0252,37.499)] | Oakland Cherry St | [(-122.0437,37.42),(-122.0434,37.413)] | Oakland @@ -77,9 +90,9 @@ SELECT * FROM street; Cull Canyon Road | [(-122.0536,37.435),(-122.0499,37.315)] | Oakland Cull Creek | [(-122.0624,37.875),(-122.0582,37.527)] | Berkeley D St | [(-122.1811,37.505),(-122.1805,37.497)] | Oakland + Decoto Road | [(-122.0159,37.006),(-122.016,37.002),(-122.0164,37.993)] | Berkeley Decoto Road | [(-122.0159,37.006),(-122.016,37.002),(-122.0164,37.993)] | Oakland Decoto Road | [(-122.0159,37.006),(-122.016,37.002),(-122.0164,37.993)] | Oakland - Decoto Road | [(-122.0159,37.006),(-122.016,37.002),(-122.0164,37.993)] | Berkeley Deering St | [(-122.2146,37.904),(-122.2126,37.897)] | Berkeley Dimond Ave | [(-122.2167,37.994),(-122.2162,37.006)] | Berkeley Dimond Ave | [(-122.2167,37.994),(-122.2162,37.006)] | Lafayette @@ -117,12 +130,12 @@ SELECT * FROM street; I- 580 | [(-121.9322,37.989),(-121.9243,37.006),(-121.9217,37.014)] | Oakland I- 580 | [(-122.018,37.019),(-122.0009,37.032),(-121.9787,37.983),(-121.958,37.984),(-121.9571,37.986)] | Oakland I- 580 | [(-122.018,37.019),(-122.0009,37.032),(-121.9787,37.983),(-121.958,37.984),(-121.9571,37.986)] | Oakland - I- 580 | [(-122.1108,37.023),(-122.1101,37.02),(-122.108103,37.00764),(-122.108,37.007),(-122.1069,37.998),(-122.1064,37.994),(-122.1053,37.982),(-122.1048,37.977),(-122.1032,37.958),(-122.1026,37.953),(-122.1013,37.938),(-122.0989,37.911),(-122.0984,37.91),(-122.098,37.908)] | Oakland I- 580 | [(-122.1108,37.023),(-122.1101,37.02),(-122.108103,37.00764),(-122.108,37.007),(-122.1069,37.998),(-122.1064,37.994),(-122.1053,37.982),(-122.1048,37.977),(-122.1032,37.958),(-122.1026,37.953),(-122.1013,37.938),(-122.0989,37.911),(-122.0984,37.91),(-122.098,37.908)] | Berkeley + I- 
580 | [(-122.1108,37.023),(-122.1101,37.02),(-122.108103,37.00764),(-122.108,37.007),(-122.1069,37.998),(-122.1064,37.994),(-122.1053,37.982),(-122.1048,37.977),(-122.1032,37.958),(-122.1026,37.953),(-122.1013,37.938),(-122.0989,37.911),(-122.0984,37.91),(-122.098,37.908)] | Oakland I- 580 | [(-122.1543,37.703),(-122.1535,37.694),(-122.1512,37.655),(-122.1475,37.603),(-122.1468,37.583),(-122.1472,37.569),(-122.149044,37.54874),(-122.1493,37.546),(-122.1501,37.532),(-122.1506,37.509),(-122.1495,37.482),(-122.1487,37.467),(-122.1477,37.447),(-122.1414,37.383),(-122.1404,37.376),(-122.1398,37.372),(-122.139,37.356),(-122.1388,37.353),(-122.1385,37.34),(-122.1382,37.33),(-122.1378,37.316)] | Oakland I- 580 | [(-122.1543,37.703),(-122.1535,37.694),(-122.1512,37.655),(-122.1475,37.603),(-122.1468,37.583),(-122.1472,37.569),(-122.149044,37.54874),(-122.1493,37.546),(-122.1501,37.532),(-122.1506,37.509),(-122.1495,37.482),(-122.1487,37.467),(-122.1477,37.447),(-122.1414,37.383),(-122.1404,37.376),(-122.1398,37.372),(-122.139,37.356),(-122.1388,37.353),(-122.1385,37.34),(-122.1382,37.33),(-122.1378,37.316)] | Berkeley - I- 580 | [(-122.2197,37.99),(-122.22,37.99),(-122.222092,37.99523),(-122.2232,37.998),(-122.224146,37.99963),(-122.2261,37.003),(-122.2278,37.007),(-122.2302,37.026),(-122.2323,37.043),(-122.2344,37.059),(-122.235405,37.06427),(-122.2365,37.07)] | Berkeley I- 580 | [(-122.2197,37.99),(-122.22,37.99),(-122.222092,37.99523),(-122.2232,37.998),(-122.224146,37.99963),(-122.2261,37.003),(-122.2278,37.007),(-122.2302,37.026),(-122.2323,37.043),(-122.2344,37.059),(-122.235405,37.06427),(-122.2365,37.07)] | Lafayette + I- 580 | [(-122.2197,37.99),(-122.22,37.99),(-122.222092,37.99523),(-122.2232,37.998),(-122.224146,37.99963),(-122.2261,37.003),(-122.2278,37.007),(-122.2302,37.026),(-122.2323,37.043),(-122.2344,37.059),(-122.235405,37.06427),(-122.2365,37.07)] | Berkeley I- 580 Ramp | 
[(-121.8521,37.011),(-121.8479,37.999),(-121.8476,37.999),(-121.8456,37.01),(-121.8455,37.011)] | Oakland I- 580 Ramp | [(-121.8521,37.011),(-121.8479,37.999),(-121.8476,37.999),(-121.8456,37.01),(-121.8455,37.011)] | Oakland I- 580 Ramp | [(-121.8743,37.014),(-121.8722,37.999),(-121.8714,37.999)] | Oakland @@ -136,8 +149,8 @@ SELECT * FROM street; I- 580 Ramp | [(-122.0941,37.897),(-122.0943,37.902)] | Berkeley I- 580 Ramp | [(-122.096,37.888),(-122.0962,37.891),(-122.0964,37.9)] | Berkeley I- 580 Ramp | [(-122.101,37.898),(-122.1005,37.902),(-122.0989,37.911)] | Berkeley - I- 580 Ramp | [(-122.1086,37.003),(-122.1068,37.993),(-122.1066,37.992),(-122.1053,37.982)] | Oakland I- 580 Ramp | [(-122.1086,37.003),(-122.1068,37.993),(-122.1066,37.992),(-122.1053,37.982)] | Berkeley + I- 580 Ramp | [(-122.1086,37.003),(-122.1068,37.993),(-122.1066,37.992),(-122.1053,37.982)] | Oakland I- 580 Ramp | [(-122.1414,37.383),(-122.1407,37.376),(-122.1403,37.372),(-122.139,37.356)] | Oakland I- 580/I-680 Ramp | ((-121.9207,37.988),(-121.9192,37.016)) | Oakland I- 580/I-680 Ramp | ((-121.9207,37.988),(-121.9192,37.016)) | Oakland @@ -158,16 +171,16 @@ SELECT * FROM street; I- 880 | ((-121.9669,37.075),(-121.9663,37.071),(-121.9656,37.065),(-121.9618,37.037),(-121.95689,37),(-121.948,37.933)) | Oakland I- 880 | [(-121.948,37.933),(-121.9471,37.925),(-121.9467,37.923),(-121.946,37.918),(-121.9452,37.912),(-121.937,37.852)] | Oakland I- 880 | [(-122.0219,37.466),(-122.0205,37.447),(-122.020331,37.44447),(-122.020008,37.43962),(-122.0195,37.432),(-122.0193,37.429),(-122.0164,37.393),(-122.010219,37.34771),(-122.0041,37.313)] | Oakland - I- 880 | [(-122.0375,37.632),(-122.0359,37.619),(-122.0358,37.616),(-122.034514,37.60409),(-122.031876,37.57965),(-122.031193,37.57332),(-122.03016,37.56375),(-122.02943,37.55698),(-122.028689,37.54929),(-122.027833,37.53908),(-122.025979,37.51698),(-122.0238,37.491)] | Oakland I- 880 | 
[(-122.0375,37.632),(-122.0359,37.619),(-122.0358,37.616),(-122.034514,37.60409),(-122.031876,37.57965),(-122.031193,37.57332),(-122.03016,37.56375),(-122.02943,37.55698),(-122.028689,37.54929),(-122.027833,37.53908),(-122.025979,37.51698),(-122.0238,37.491)] | Berkeley - I- 880 | [(-122.0612,37.003),(-122.0604,37.991),(-122.0596,37.982),(-122.0585,37.967),(-122.0583,37.961),(-122.0553,37.918),(-122.053635,37.89475),(-122.050759,37.8546),(-122.05,37.844),(-122.0485,37.817),(-122.0483,37.813),(-122.0482,37.811)] | Oakland + I- 880 | [(-122.0375,37.632),(-122.0359,37.619),(-122.0358,37.616),(-122.034514,37.60409),(-122.031876,37.57965),(-122.031193,37.57332),(-122.03016,37.56375),(-122.02943,37.55698),(-122.028689,37.54929),(-122.027833,37.53908),(-122.025979,37.51698),(-122.0238,37.491)] | Oakland I- 880 | [(-122.0612,37.003),(-122.0604,37.991),(-122.0596,37.982),(-122.0585,37.967),(-122.0583,37.961),(-122.0553,37.918),(-122.053635,37.89475),(-122.050759,37.8546),(-122.05,37.844),(-122.0485,37.817),(-122.0483,37.813),(-122.0482,37.811)] | Oakland I- 880 | [(-122.0612,37.003),(-122.0604,37.991),(-122.0596,37.982),(-122.0585,37.967),(-122.0583,37.961),(-122.0553,37.918),(-122.053635,37.89475),(-122.050759,37.8546),(-122.05,37.844),(-122.0485,37.817),(-122.0483,37.813),(-122.0482,37.811)] | Berkeley + I- 880 | [(-122.0612,37.003),(-122.0604,37.991),(-122.0596,37.982),(-122.0585,37.967),(-122.0583,37.961),(-122.0553,37.918),(-122.053635,37.89475),(-122.050759,37.8546),(-122.05,37.844),(-122.0485,37.817),(-122.0483,37.813),(-122.0482,37.811)] | Oakland I- 880 | [(-122.0831,37.312),(-122.0819,37.296),(-122.081,37.285),(-122.0786,37.248),(-122.078,37.24),(-122.077642,37.23496),(-122.076983,37.22567),(-122.076599,37.22026),(-122.076229,37.21505),(-122.0758,37.209)] | Oakland I- 880 | [(-122.0978,37.528),(-122.096,37.496),(-122.0931,37.453),(-122.09277,37.4496),(-122.090189,37.41442),(-122.0896,37.405),(-122.085,37.34)] | Oakland I- 880 | 
[(-122.1365,37.902),(-122.1358,37.898),(-122.1333,37.881),(-122.1323,37.874),(-122.1311,37.866),(-122.1308,37.865),(-122.1307,37.864),(-122.1289,37.851),(-122.1277,37.843),(-122.1264,37.834),(-122.1231,37.812),(-122.1165,37.766),(-122.1104,37.72),(-122.109695,37.71094),(-122.109,37.702),(-122.108312,37.69168),(-122.1076,37.681)] | Berkeley - I- 880 | [(-122.1755,37.185),(-122.1747,37.178),(-122.1742,37.173),(-122.1692,37.126),(-122.167792,37.11594),(-122.16757,37.11435),(-122.1671,37.111),(-122.1655,37.1),(-122.165169,37.09811),(-122.1641,37.092),(-122.1596,37.061),(-122.158381,37.05275),(-122.155991,37.03657),(-122.1531,37.017),(-122.1478,37.98),(-122.1407,37.932),(-122.1394,37.924),(-122.1389,37.92),(-122.1376,37.91)] | Oakland I- 880 | [(-122.1755,37.185),(-122.1747,37.178),(-122.1742,37.173),(-122.1692,37.126),(-122.167792,37.11594),(-122.16757,37.11435),(-122.1671,37.111),(-122.1655,37.1),(-122.165169,37.09811),(-122.1641,37.092),(-122.1596,37.061),(-122.158381,37.05275),(-122.155991,37.03657),(-122.1531,37.017),(-122.1478,37.98),(-122.1407,37.932),(-122.1394,37.924),(-122.1389,37.92),(-122.1376,37.91)] | Berkeley + I- 880 | [(-122.1755,37.185),(-122.1747,37.178),(-122.1742,37.173),(-122.1692,37.126),(-122.167792,37.11594),(-122.16757,37.11435),(-122.1671,37.111),(-122.1655,37.1),(-122.165169,37.09811),(-122.1641,37.092),(-122.1596,37.061),(-122.158381,37.05275),(-122.155991,37.03657),(-122.1531,37.017),(-122.1478,37.98),(-122.1407,37.932),(-122.1394,37.924),(-122.1389,37.92),(-122.1376,37.91)] | Oakland I- 880 | [(-122.2214,37.711),(-122.2202,37.699),(-122.2199,37.695),(-122.219,37.682),(-122.2184,37.672),(-122.2173,37.652),(-122.2159,37.638),(-122.2144,37.616),(-122.2138,37.612),(-122.2135,37.609),(-122.212,37.592),(-122.2116,37.586),(-122.2111,37.581)] | Berkeley I- 880 | 
[(-122.2707,37.975),(-122.2693,37.972),(-122.2681,37.966),(-122.267,37.962),(-122.2659,37.957),(-122.2648,37.952),(-122.2636,37.946),(-122.2625,37.935),(-122.2617,37.927),(-122.2607,37.921),(-122.2593,37.916),(-122.258,37.911),(-122.2536,37.898),(-122.2432,37.858),(-122.2408,37.845),(-122.2386,37.827),(-122.2374,37.811)] | Berkeley I- 880 Ramp | [(-122.0019,37.301),(-122.002,37.293)] | Oakland @@ -202,28 +215,28 @@ SELECT * FROM street; Laguna Ave | [(-122.2099,37.989),(-122.2089,37)] | Berkeley Laguna Ave | [(-122.2099,37.989),(-122.2089,37)] | Lafayette Lakehurst Cir | [(-122.284729,37.89025),(-122.286096,37.90364)] | Berkeley - Lakeshore Ave | [(-122.2586,37.99),(-122.2556,37.006)] | Berkeley Lakeshore Ave | [(-122.2586,37.99),(-122.2556,37.006)] | Lafayette + Lakeshore Ave | [(-122.2586,37.99),(-122.2556,37.006)] | Berkeley Las Positas Road | [(-121.764488,37.99199),(-121.75569,37.02022)] | Oakland Las Positas Road | [(-121.764488,37.99199),(-121.75569,37.02022)] | Oakland - Linden St | [(-122.2867,37.998),(-122.2864,37.008)] | Berkeley Linden St | [(-122.2867,37.998),(-122.2864,37.008)] | Lafayette + Linden St | [(-122.2867,37.998),(-122.2864,37.008)] | Berkeley Livermore Ave | [(-121.7687,37.448),(-121.769,37.375)] | Oakland Livermore Ave | [(-121.7687,37.448),(-121.769,37.375)] | Oakland Livermore Ave | [(-121.772719,37.99085),(-121.7728,37.001)] | Oakland Livermore Ave | [(-121.772719,37.99085),(-121.7728,37.001)] | Oakland - Locust St | [(-122.1606,37.007),(-122.1593,37.987)] | Oakland Locust St | [(-122.1606,37.007),(-122.1593,37.987)] | Berkeley + Locust St | [(-122.1606,37.007),(-122.1593,37.987)] | Oakland Logan Ct | [(-122.0053,37.492),(-122.0061,37.484)] | Oakland Magnolia St | [(-122.0971,37.5),(-122.0962,37.484)] | Oakland Mandalay Road | [(-122.2322,37.397),(-122.2321,37.403)] | Lafayette Marin Ave | [(-122.2741,37.894),(-122.272,37.901)] | Berkeley Martin Luther King Jr Way | [(-122.2712,37.608),(-122.2711,37.599)] | Berkeley Mattos Dr | 
[(-122.0005,37.502),(-122.000898,37.49683)] | Oakland - Maubert Ave | [(-122.1114,37.009),(-122.1096,37.995)] | Oakland Maubert Ave | [(-122.1114,37.009),(-122.1096,37.995)] | Berkeley - McClure Ave | [(-122.1431,37.001),(-122.1436,37.998)] | Oakland + Maubert Ave | [(-122.1114,37.009),(-122.1096,37.995)] | Oakland McClure Ave | [(-122.1431,37.001),(-122.1436,37.998)] | Berkeley + McClure Ave | [(-122.1431,37.001),(-122.1436,37.998)] | Oakland Medlar Dr | [(-122.0627,37.378),(-122.0625,37.375)] | Oakland Mildred Ct | [(-122.0002,37.388),(-121.9998,37.386)] | Oakland Miller Road | [(-122.0902,37.645),(-122.0865,37.545)] | Berkeley @@ -242,8 +255,8 @@ SELECT * FROM street; Parkridge Dr | [(-122.1438,37.884),(-122.1428,37.9)] | Berkeley Parkside Dr | [(-122.0475,37.603),(-122.0443,37.596)] | Berkeley Paseo Padre Pkwy | [(-121.9143,37.005),(-121.913522,37)] | Oakland - Paseo Padre Pkwy | [(-122.0021,37.639),(-121.996,37.628)] | Oakland Paseo Padre Pkwy | [(-122.0021,37.639),(-121.996,37.628)] | Berkeley + Paseo Padre Pkwy | [(-122.0021,37.639),(-121.996,37.628)] | Oakland Pearl St | [(-122.2383,37.594),(-122.2366,37.615)] | Berkeley Periwinkle Road | [(-122.0451,37.301),(-122.044758,37.29844)] | Oakland Pimlico Dr | [(-121.8616,37.998),(-121.8618,37.008)] | Oakland @@ -254,11 +267,11 @@ SELECT * FROM street; Railroad Ave | [(-122.0245,37.013),(-122.0234,37.003),(-122.0223,37.993)] | Oakland Railroad Ave | [(-122.0245,37.013),(-122.0234,37.003),(-122.0223,37.993)] | Berkeley Ranspot Dr | [(-122.0972,37.999),(-122.0959,37)] | Oakland - Ranspot Dr | [(-122.0972,37.999),(-122.0959,37)] | Oakland Ranspot Dr | [(-122.0972,37.999),(-122.0959,37)] | Berkeley + Ranspot Dr | [(-122.0972,37.999),(-122.0959,37)] | Oakland Redding St | [(-122.1978,37.901),(-122.1975,37.895)] | Berkeley - Redwood Road | [(-122.1493,37.98),(-122.1437,37.001)] | Oakland Redwood Road | [(-122.1493,37.98),(-122.1437,37.001)] | Berkeley + Redwood Road | [(-122.1493,37.98),(-122.1437,37.001)] | Oakland 
Roca Dr | [(-122.0335,37.609),(-122.0314,37.599)] | Berkeley Rosedale Ct | [(-121.9232,37.9),(-121.924,37.897)] | Oakland Sacramento St | [(-122.2799,37.606),(-122.2797,37.597)] | Berkeley @@ -266,8 +279,8 @@ SELECT * FROM street; Saginaw Ct | [(-121.8803,37.898),(-121.8806,37.901)] | Oakland San Andreas Dr | [(-122.0609,37.9),(-122.0614,37.895)] | Berkeley Santa Maria Ave | [(-122.0773,37),(-122.0773,37.98)] | Oakland - Santa Maria Ave | [(-122.0773,37),(-122.0773,37.98)] | Oakland Santa Maria Ave | [(-122.0773,37),(-122.0773,37.98)] | Berkeley + Santa Maria Ave | [(-122.0773,37),(-122.0773,37.98)] | Oakland Shattuck Ave | [(-122.2686,37.904),(-122.2686,37.897)] | Berkeley Sheridan Road | [(-122.2279,37.425),(-122.2253,37.411),(-122.2223,37.377)] | Lafayette Shoreline Dr | [(-122.2657,37.603),(-122.2648,37.6)] | Berkeley @@ -317,27 +330,14 @@ SELECT * FROM street; Welch Creek Road | [(-121.7695,37.386),(-121.7737,37.413)] | Oakland West Loop Road | [(-122.0576,37.604),(-122.0602,37.586)] | Berkeley Western Pacific Railroad Spur | [(-122.0394,37.018),(-122.0394,37.961)] | Oakland - Western Pacific Railroad Spur | [(-122.0394,37.018),(-122.0394,37.961)] | Oakland Western Pacific Railroad Spur | [(-122.0394,37.018),(-122.0394,37.961)] | Berkeley + Western Pacific Railroad Spur | [(-122.0394,37.018),(-122.0394,37.961)] | Oakland Whitlock Creek | [(-121.74683,37.91276),(-121.733107,37)] | Oakland Whitlock Creek | [(-121.74683,37.91276),(-121.733107,37)] | Oakland Willimet Way | [(-122.0964,37.517),(-122.0949,37.493)] | Oakland Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Oakland Wisconsin St | [(-122.1994,37.017),(-122.1975,37.998),(-122.1971,37.994)] | Berkeley Wp Railroad | [(-122.254,37.902),(-122.2506,37.891)] | Berkeley - 100th Ave | [(-122.1657,37.429),(-122.1647,37.432)] | Oakland - 107th Ave | [(-122.1555,37.403),(-122.1531,37.41)] | Oakland - 14th St | [(-122.299,37.147),(-122.3,37.148)] | Lafayette - 19th Ave | 
[(-122.2366,37.897),(-122.2359,37.905)] | Berkeley - 1st St | [(-121.75508,37.89294),(-121.753581,37.90031)] | Oakland - 5th St | [(-122.278,37),(-122.2792,37.005),(-122.2803,37.009)] | Lafayette - 5th St | [(-122.296,37.615),(-122.2953,37.598)] | Berkeley - 82nd Ave | [(-122.1695,37.596),(-122.1681,37.603)] | Berkeley - 85th Ave | [(-122.1877,37.466),(-122.186,37.476)] | Oakland - 89th Ave | [(-122.1822,37.459),(-122.1803,37.471)] | Oakland - 98th Ave | [(-122.1568,37.498),(-122.1558,37.502)] | Oakland - 98th Ave | [(-122.1693,37.438),(-122.1682,37.444)] | Oakland - 98th Ave | [(-122.2001,37.258),(-122.1974,37.27)] | Lafayette (333 rows) SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2; diff --git a/src/test/regress/expected/txid.out b/src/test/regress/expected/txid.out index 95ba66e95e..2ea4434f51 100644 --- a/src/test/regress/expected/txid.out +++ b/src/test/regress/expected/txid.out @@ -238,9 +238,11 @@ SELECT txid_snapshot '1:9223372036854775807:3'; (1 row) SELECT txid_snapshot '1:9223372036854775808:3'; -ERROR: invalid input syntax for type pg_snapshot: "1:9223372036854775808:3" -LINE 1: SELECT txid_snapshot '1:9223372036854775808:3'; - ^ + txid_snapshot +------------------------- + 1:9223372036854775808:3 +(1 row) + -- test txid_current_if_assigned BEGIN; SELECT txid_current_if_assigned() IS NULL; diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out index d3ac08c9ee..952019b2e2 100644 --- a/src/test/regress/expected/type_sanity.out +++ b/src/test/regress/expected/type_sanity.out @@ -19,7 +19,7 @@ WHERE t1.typnamespace = 0 OR (t1.typlen <= 0 AND t1.typlen != -1 AND t1.typlen != -2) OR (t1.typtype not in ('b', 'c', 'd', 'e', 'm', 'p', 'r')) OR NOT t1.typisdefined OR - (t1.typalign not in ('c', 's', 'i', 'd')) OR + (t1.typalign not in ('c', 's', 'i', 'd', 'x')) OR (t1.typstorage not in ('p', 'x', 'e', 'm')); oid | typname -----+--------- @@ -32,7 +32,8 @@ WHERE t1.typbyval AND (t1.typlen != 1 OR 
t1.typalign != 'c') AND (t1.typlen != 2 OR t1.typalign != 's') AND (t1.typlen != 4 OR t1.typalign != 'i') AND - (t1.typlen != 8 OR t1.typalign != 'd'); + (t1.typlen != 8 OR t1.typalign != 'd') AND + (t1.typlen != 8 OR t1.typalign != 'x'); oid | typname -----+--------- (0 rows) diff --git a/src/test/regress/expected/xid.out b/src/test/regress/expected/xid.out index d8e76f3321..3252bdc28d 100644 --- a/src/test/regress/expected/xid.out +++ b/src/test/regress/expected/xid.out @@ -8,9 +8,9 @@ select '010'::xid, '42'::xid8, '0xffffffffffffffff'::xid8, '-1'::xid8; - xid | xid | xid | xid | xid8 | xid8 | xid8 | xid8 ------+-----+------------+------------+------+------+----------------------+---------------------- - 8 | 42 | 4294967295 | 4294967295 | 8 | 42 | 18446744073709551615 | 18446744073709551615 + xid | xid | xid | xid | xid8 | xid8 | xid8 | xid8 +-----+-----+------------+----------------------+------+------+----------------------+---------------------- + 8 | 42 | 4294967295 | 18446744073709551615 | 8 | 42 | 18446744073709551615 | 18446744073709551615 (1 row) -- garbage values are not yet rejected (perhaps they should be) @@ -381,9 +381,11 @@ SELECT pg_snapshot '1:9223372036854775807:3'; (1 row) SELECT pg_snapshot '1:9223372036854775808:3'; -ERROR: invalid input syntax for type pg_snapshot: "1:9223372036854775808:3" -LINE 1: SELECT pg_snapshot '1:9223372036854775808:3'; - ^ + pg_snapshot +------------------------- + 1:9223372036854775808:3 +(1 row) + -- test pg_current_xact_id_if_assigned BEGIN; SELECT pg_current_xact_id_if_assigned() IS NULL; diff --git a/src/test/regress/expected/xid64.out b/src/test/regress/expected/xid64.out new file mode 100644 index 0000000000..c30c5b5739 --- /dev/null +++ b/src/test/regress/expected/xid64.out @@ -0,0 +1,92 @@ +--- +--- Unit test for xid64 functions +--- +-- directory paths and dlsuffix are passed to us in environment variables +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX +\set regresslib :libdir '/regress' :dlsuffix 
+CREATE FUNCTION xid64_test_1(rel regclass) RETURNS VOID + AS :'regresslib', 'xid64_test_1' LANGUAGE C STRICT; +CREATE FUNCTION xid64_test_2(rel regclass) RETURNS VOID + AS :'regresslib', 'xid64_test_2' LANGUAGE C STRICT; +CREATE FUNCTION xid64_test_double_xmax(rel regclass) RETURNS VOID + AS :'regresslib', 'xid64_test_double_xmax' LANGUAGE C STRICT; +--- +--- Check page consistency after conversion +--- +CREATE UNLOGGED TABLE test_xid64_table(a int); +ALTER TABLE test_xid64_table SET (autovacuum_enabled = false); +INSERT INTO test_xid64_table(a) SELECT a FROM generate_series(1, 1000) AS a; +SELECT xid64_test_1('test_xid64_table'); +INFO: test 1: page is converted to xid64 format + xid64_test_1 +-------------- + +(1 row) + +DROP TABLE test_xid64_table; +--- +--- Check tuples consistency after conversion +--- +CREATE UNLOGGED TABLE test_xid64_table(s serial, i int, t text); +ALTER TABLE test_xid64_table SET (autovacuum_enabled = false); +DO $$ +BEGIN + FOR j IN 1..20 LOOP + INSERT INTO test_xid64_table(i, t) VALUES (random()::int, md5(random()::text)); + COMMIT; + END LOOP; +END $$; +DO $$ +BEGIN + FOR j IN 1..10 LOOP + DELETE FROM test_xid64_table WHERE ctid IN (SELECT ctid FROM test_xid64_table TABLESAMPLE BERNOULLI (5)); + COMMIT; + END LOOP; +END $$; +SELECT xid64_test_2('test_xid64_table'); + xid64_test_2 +-------------- + +(1 row) + +DROP TABLE test_xid64_table; +--- +--- Check tuples consistency after conversion to double xmax (on full page) +--- +CREATE UNLOGGED TABLE test_xid64_table(i int); +DO $$ +BEGIN + FOR j IN 1..40 LOOP + INSERT INTO test_xid64_table SELECT i FROM generate_series(1, 100) AS i; + COMMIT; + END LOOP; +END $$; +SELECT xid64_test_2('test_xid64_table'); + xid64_test_2 +-------------- + +(1 row) + +DROP TABLE test_xid64_table; +CREATE UNLOGGED TABLE test_xid64_table(i text); +INSERT INTO test_xid64_table(i) VALUES ('NNBABCDSDFGHJKLP'); +DO $$ +BEGIN + FOR j IN 1..40 LOOP + INSERT INTO test_xid64_table(i) SELECT 'A' FROM generate_series(1, 
100) AS i; + COMMIT; + END LOOP; +END $$; +SELECT xid64_test_double_xmax('test_xid64_table'); +INFO: test double xmax: page 0 is converted into double xmax format +INFO: test double xmax: end + xid64_test_double_xmax +------------------------ + +(1 row) + +DROP TABLE test_xid64_table; +DROP FUNCTION xid64_test_1(rel regclass); +DROP FUNCTION xid64_test_2(rel regclass); +DROP FUNCTION xid64_test_double_xmax(rel regclass); diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 9f644a0c1b..2344cb38b3 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -33,7 +33,7 @@ test: strings numerology point lseg line box path polygon circle date time timet # geometry depends on point, lseg, line, box, path, polygon, circle # horology depends on date, time, timetz, timestamp, timestamptz, interval # ---------- -test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid mvcc +test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid xid64 mvcc # ---------- # Load huge amounts of data diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index dda076847a..b6ede1f800 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -2252,7 +2252,7 @@ regression_main(int argc, char *argv[], /* initdb */ header(_("initializing database system")); snprintf(buf, sizeof(buf), - "\"%s%sinitdb\" -D \"%s/data\" --no-clean --no-sync%s%s > \"%s/log/initdb.log\" 2>&1", + "\"%s%sinitdb\" -D \"%s/data\" -x 1249835483136 -m 2422361554944 -o 3594887626752 --no-clean --no-sync%s%s > \"%s/log/initdb.log\" 2>&1", bindir ? bindir : "", bindir ? 
"/" : "", temp_instance, diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 548afb4438..fd6dd77ce5 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -23,6 +23,7 @@ #include "access/htup_details.h" #include "access/transam.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "catalog/namespace.h" #include "catalog/pg_operator.h" #include "catalog/pg_type.h" @@ -1257,3 +1258,293 @@ get_columns_length(PG_FUNCTION_ARGS) PG_RETURN_INT32(column_offset); } + +#include "access/hio.h" +#include "access/relation.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + +static void +CheckNewPage(char *msg, Page page) +{ + uint16 size; + + if (PageGetPageLayoutVersion(page) != PG_PAGE_LAYOUT_VERSION) + elog(ERROR, "%s: page version is %d, expected %d ", + msg, PageGetPageLayoutVersion(page), PG_PAGE_LAYOUT_VERSION); + + size = PageGetSpecialSize(page); + if (size == MAXALIGN(sizeof(HeapPageSpecialData))) + elog(INFO, "%s: page is converted to xid64 format", msg); + else if (HeapPageIsDoubleXmax(page)) + elog(INFO, "%s: page is converted into double xmax format", msg); + else + elog(ERROR, "%s: converted page has pageSpecial size %u, expected %llu", + msg, size, + (unsigned long long) MAXALIGN(sizeof(HeapPageSpecialData))); +} + +/* + * Get page from relation. + * Make this page look like in 32-bit xid format. + * Convert it to 64-bit xid format. + * Run basic checks. 
+ */ +PG_FUNCTION_INFO_V1(xid64_test_1); +Datum +xid64_test_1(PG_FUNCTION_ARGS) +{ + Oid relid; + Relation rel; + Buffer buf; + Page page; + PageHeader hdr; + + relid = PG_GETARG_OID(0); + rel = relation_open(relid, AccessExclusiveLock); + buf = ReadBuffer(rel, 0); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buf); + hdr = (PageHeader) page; + + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HeapPageSpecialData))) + elog(ERROR, "page expected in new format"); + + if (PageGetPageLayoutVersion(page) != PG_PAGE_LAYOUT_VERSION) + elog(ERROR, "unknown page version (%u)", + PageGetPageLayoutVersion(page)); + + hdr->pd_special = BLCKSZ; + PageSetPageSizeAndVersion(page, BLCKSZ, PG_PAGE_LAYOUT_VERSION - 1); + + convert_page(rel, page, buf, 0); + CheckNewPage("test 1", page); + + UnlockReleaseBuffer(buf); + relation_close(rel, AccessExclusiveLock); + + PG_RETURN_VOID(); +} + +typedef struct TupleCheckValues +{ + TransactionId xmin; + TransactionId xmax; +} TupleCheckValues; + +typedef struct RelCheckValues +{ + TupleCheckValues *tcv; + Size ntuples; +} RelCheckValues; + +static RelCheckValues +FillRelCheckValues(Relation rel, Buffer buffer, Page page) +{ + RelCheckValues set; + Size n; + +#define DEFAULT_SET_SIZE 64 + n = DEFAULT_SET_SIZE; + set.ntuples = 0; + set.tcv = palloc(sizeof(set.tcv[0]) * n); + + { + OffsetNumber maxoff, + offnum; + HeapTupleHeader tuphdr; + ItemId itemid; + HeapTupleData tuple; + TransactionId xmin, + xmax; + + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + tuphdr = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_data = tuphdr; + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + if (HeapPageGetSpecial(page) == heapDoubleXmaxSpecial) + { + xmin = tuphdr->t_choice.t_heap.t_xmin; + xmax = tuphdr->t_choice.t_heap.t_xmax; + } + else + { + 
HeapTupleCopyBaseFromPage(buffer, &tuple, page, + IsToastRelation(rel)); + + xmin = HeapTupleGetRawXmin(&tuple); + xmax = HeapTupleGetRawXmax(&tuple); + } + + if (set.ntuples == n) + { + n *= 2; + set.tcv = repalloc(set.tcv, sizeof(set.tcv[0]) * n); + } + + set.tcv[set.ntuples].xmin = xmin; + set.tcv[set.ntuples].xmax = xmax; + set.ntuples++; + } + } + + return set; +} + +/* + * Test xmin/xmax invariant when converting page from 32bit xid to 64xid. + * + * Scenario: + * - enforce all relation pages to 32bit xid format, discarding pd_xid_base and + * pd_multi_base + * - store all xmin/xmax in array + * - convert all the pages from relation into 64xid format + * - store all new xmin/xmax in array + * - compare old and new xmin/xmax + * + * NOTE: initial xid value does not affect test as pd_xid_base/pd_multi_base + * discarded. + */ +PG_FUNCTION_INFO_V1(xid64_test_2); +Datum +xid64_test_2(PG_FUNCTION_ARGS) +{ + Oid relid; + Relation rel; + RelCheckValues before, + after; + BlockNumber pageno, + npages; + Size i; + + relid = PG_GETARG_OID(0); + rel = relation_open(relid, AccessExclusiveLock); + npages = RelationGetNumberOfBlocks(rel); + + for (pageno = 0; pageno != npages; ++pageno) + { + Buffer buf; + Page page; + PageHeader hdr; + + /* get page */ + buf = ReadBuffer(rel, pageno); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + hdr = (PageHeader) page; + + /* make page look like 32-bit xid page */ + hdr->pd_special = BLCKSZ; + PageSetPageSizeAndVersion(page, BLCKSZ, PG_PAGE_LAYOUT_VERSION - 1); + + before = FillRelCheckValues(rel, buf, page); + convert_page(rel, page, buf, pageno); + after = FillRelCheckValues(rel, buf, page); + + /* check */ + if (before.ntuples != after.ntuples) + elog(ERROR, "number of tuples must be equal"); + + for (i = 0; i != before.ntuples; ++i) + { + if (before.tcv[i].xmin != after.tcv[i].xmin && after.tcv[i].xmin) + elog(ERROR, "old and new xmin does not match (%llu != %llu)", + (unsigned long long) before.tcv[i].xmin, 
+ (unsigned long long) after.tcv[i].xmin); + + if (before.tcv[i].xmax != after.tcv[i].xmax) + elog(ERROR, "old and new xmax does not match (%llu != %llu)", + (unsigned long long) before.tcv[i].xmax, + (unsigned long long) after.tcv[i].xmax); + } + + Assert(npages != 0); + pfree(before.tcv); + pfree(after.tcv); + + UnlockReleaseBuffer(buf); + } + + relation_close(rel, AccessExclusiveLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(xid64_test_double_xmax); +Datum +xid64_test_double_xmax(PG_FUNCTION_ARGS) +{ + Oid relid; + Relation rel; + BlockNumber pageno, + npages; + bool found; + + relid = PG_GETARG_OID(0); + rel = relation_open(relid, AccessExclusiveLock); + npages = RelationGetNumberOfBlocks(rel); + found = false; + + for (pageno = 0; pageno != npages; ++pageno) + { + Buffer buf; + Page page; + PageHeader hdr; + ItemId itemid; + OffsetNumber offnum; + HeapTupleHeader tuphdr; + + buf = ReadBuffer(rel, pageno); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + hdr = (PageHeader) page; + + if (pageno == 0) + { + itemid = PageGetItemId(page, FirstOffsetNumber); + itemid->lp_len += 16; /* Move to overlap special */ + } + + for (offnum = FirstOffsetNumber; + offnum <= PageGetMaxOffsetNumber(page); + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + tuphdr = (HeapTupleHeader) PageGetItem(page, itemid); + tuphdr->t_infomask |= HEAP_XMIN_COMMITTED; + } + + hdr->pd_special = BLCKSZ; + PageSetPageSizeAndVersion(page, BLCKSZ, PG_PAGE_LAYOUT_VERSION - 1); + + convert_page(rel, page, buf, pageno); + + if (HeapPageIsDoubleXmax(page)) + { + found = true; + elog(INFO, "test double xmax: page %u is converted into double xmax format", + pageno); + } + + UnlockReleaseBuffer(buf); + } + + if (!found) + elog(ERROR, "test double xmax: failed, no double xmax"); + + Assert(npages != 0); + elog(INFO, "test double xmax: end"); + + relation_close(rel, AccessExclusiveLock); + + PG_RETURN_VOID(); +} diff --git 
a/src/test/regress/sql/indirect_toast.sql b/src/test/regress/sql/indirect_toast.sql index 3e2f6c0237..ea087b5128 100644 --- a/src/test/regress/sql/indirect_toast.sql +++ b/src/test/regress/sql/indirect_toast.sql @@ -76,7 +76,18 @@ SELECT substring(indtoasttest::text, 1, 200) FROM indtoasttest; VACUUM FREEZE indtoasttest; SELECT substring(indtoasttest::text, 1, 200) FROM indtoasttest; +create or replace function random_string(len integer) returns text as $$ +select substr((select string_agg(r,'') from (select random()::text as r from generate_series(1,(len+15)/16)) s1), 1, len); +$$ language sql; + +create table toasttest_main(t text); +alter table toasttest_main alter column t set storage main; + +insert into toasttest_main (select random_string(len) from generate_series(7000,8000) len); + DROP TABLE indtoasttest; +DROP TABLE toasttest_main; DROP FUNCTION update_using_indirect(); +DROP FUNCTION random_string(integer); RESET default_toast_compression; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index bdcffd0314..7ada0801eb 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -55,7 +55,7 @@ INSERT INTO large_tuple_test (select 3, NULL); -- now this tuple won't fit on the second page, but the insert should -- still succeed by extending the relation -INSERT INTO large_tuple_test (select 4, repeat('a', 8126)); +INSERT INTO large_tuple_test (select 4, repeat('a', 8112)); DROP TABLE large_tuple_test; @@ -597,3 +597,18 @@ alter table returningwrtest2 drop c; alter table returningwrtest attach partition returningwrtest2 for values in (2); insert into returningwrtest values (2, 'foo') returning returningwrtest; drop table returningwrtest; + +-- Check for MaxHeapTupleSize +create table maxheaptuplesize_test(value text); +alter table maxheaptuplesize_test alter column value set storage external; +insert into maxheaptuplesize_test values (repeat('x', 8104)); +insert into maxheaptuplesize_test values 
(repeat('x', 8112)); +insert into maxheaptuplesize_test values (repeat('x', 8120)); +insert into maxheaptuplesize_test values (repeat('x', 8128)); +insert into maxheaptuplesize_test values (repeat('x', 8136)); +insert into maxheaptuplesize_test values (repeat('x', 8144)); +insert into maxheaptuplesize_test values (repeat('x', 8152)); +insert into maxheaptuplesize_test values (repeat('x', 8160)); +insert into maxheaptuplesize_test values (repeat('x', 8168)); +insert into maxheaptuplesize_test values (repeat('x', 8176)); +drop table maxheaptuplesize_test; diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql index e742f13699..70e663e350 100644 --- a/src/test/regress/sql/select_views.sql +++ b/src/test/regress/sql/select_views.sql @@ -3,7 +3,7 @@ -- test the views defined in CREATE_VIEWS -- -SELECT * FROM street; +SELECT * FROM street ORDER BY name COLLATE "C", thepath::text COLLATE "C"; SELECT name, #thepath FROM iexit ORDER BY name COLLATE "C", 2; diff --git a/src/test/regress/sql/type_sanity.sql b/src/test/regress/sql/type_sanity.sql index 5edc1f1f6e..2b6c4aff6c 100644 --- a/src/test/regress/sql/type_sanity.sql +++ b/src/test/regress/sql/type_sanity.sql @@ -22,7 +22,7 @@ WHERE t1.typnamespace = 0 OR (t1.typlen <= 0 AND t1.typlen != -1 AND t1.typlen != -2) OR (t1.typtype not in ('b', 'c', 'd', 'e', 'm', 'p', 'r')) OR NOT t1.typisdefined OR - (t1.typalign not in ('c', 's', 'i', 'd')) OR + (t1.typalign not in ('c', 's', 'i', 'd', 'x')) OR (t1.typstorage not in ('p', 'x', 'e', 'm')); -- Look for "pass by value" types that can't be passed by value. @@ -33,7 +33,8 @@ WHERE t1.typbyval AND (t1.typlen != 1 OR t1.typalign != 'c') AND (t1.typlen != 2 OR t1.typalign != 's') AND (t1.typlen != 4 OR t1.typalign != 'i') AND - (t1.typlen != 8 OR t1.typalign != 'd'); + (t1.typlen != 8 OR t1.typalign != 'd') AND + (t1.typlen != 8 OR t1.typalign != 'x'); -- Look for "toastable" types that aren't varlena. 
diff --git a/src/test/regress/sql/xid64.sql b/src/test/regress/sql/xid64.sql new file mode 100644 index 0000000000..caa97a0ed9 --- /dev/null +++ b/src/test/regress/sql/xid64.sql @@ -0,0 +1,84 @@ +--- +--- Unit test for xid64 functions +--- + +-- directory paths and dlsuffix are passed to us in environment variables +\getenv libdir PG_LIBDIR +\getenv dlsuffix PG_DLSUFFIX + +\set regresslib :libdir '/regress' :dlsuffix + +CREATE FUNCTION xid64_test_1(rel regclass) RETURNS VOID + AS :'regresslib', 'xid64_test_1' LANGUAGE C STRICT; +CREATE FUNCTION xid64_test_2(rel regclass) RETURNS VOID + AS :'regresslib', 'xid64_test_2' LANGUAGE C STRICT; +CREATE FUNCTION xid64_test_double_xmax(rel regclass) RETURNS VOID + AS :'regresslib', 'xid64_test_double_xmax' LANGUAGE C STRICT; + +--- +--- Check page consistency after conversion +--- +CREATE UNLOGGED TABLE test_xid64_table(a int); +ALTER TABLE test_xid64_table SET (autovacuum_enabled = false); +INSERT INTO test_xid64_table(a) SELECT a FROM generate_series(1, 1000) AS a; +SELECT xid64_test_1('test_xid64_table'); +DROP TABLE test_xid64_table; + +--- +--- Check tuples consistency after conversion +--- +CREATE UNLOGGED TABLE test_xid64_table(s serial, i int, t text); +ALTER TABLE test_xid64_table SET (autovacuum_enabled = false); + +DO $$ +BEGIN + FOR j IN 1..20 LOOP + INSERT INTO test_xid64_table(i, t) VALUES (random()::int, md5(random()::text)); + COMMIT; + END LOOP; +END $$; + +DO $$ +BEGIN + FOR j IN 1..10 LOOP + DELETE FROM test_xid64_table WHERE ctid IN (SELECT ctid FROM test_xid64_table TABLESAMPLE BERNOULLI (5)); + COMMIT; + END LOOP; +END $$; + +SELECT xid64_test_2('test_xid64_table'); +DROP TABLE test_xid64_table; + +--- +--- Check tuples consistency after conversion to double xmax (on full page) +--- +CREATE UNLOGGED TABLE test_xid64_table(i int); + +DO $$ +BEGIN + FOR j IN 1..40 LOOP + INSERT INTO test_xid64_table SELECT i FROM generate_series(1, 100) AS i; + COMMIT; + END LOOP; +END $$; + +SELECT 
xid64_test_2('test_xid64_table'); +DROP TABLE test_xid64_table; + +CREATE UNLOGGED TABLE test_xid64_table(i text); +INSERT INTO test_xid64_table(i) VALUES ('NNBABCDSDFGHJKLP'); + +DO $$ +BEGIN + FOR j IN 1..40 LOOP + INSERT INTO test_xid64_table(i) SELECT 'A' FROM generate_series(1, 100) AS i; + COMMIT; + END LOOP; +END $$; + +SELECT xid64_test_double_xmax('test_xid64_table'); +DROP TABLE test_xid64_table; + +DROP FUNCTION xid64_test_1(rel regclass); +DROP FUNCTION xid64_test_2(rel regclass); +DROP FUNCTION xid64_test_double_xmax(rel regclass); diff --git a/src/test/xid-64/Makefile b/src/test/xid-64/Makefile new file mode 100644 index 0000000000..3b1e50dfc0 --- /dev/null +++ b/src/test/xid-64/Makefile @@ -0,0 +1,22 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/xid-64 +# +# Copyright (c) 2018, Postgres Professional +# +# src/test/xid-64/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/test/xid-64 +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean maintainer-clean: + rm -rf tmp_check diff --git a/src/test/xid-64/README b/src/test/xid-64/README new file mode 100644 index 0000000000..01c0a1a1f7 --- /dev/null +++ b/src/test/xid-64/README @@ -0,0 +1,16 @@ +src/test/xid-64/README + +Regression tests for 64-bit XIDs +============================================= + +This directory contains a test suite for 64-bit xids. + +Running the tests +================= + + make check + +NOTE: This creates a temporary installation, and some tests may +create one or multiple nodes. + +NOTE: This requires the --enable-tap-tests argument to configure. 
diff --git a/src/test/xid-64/meson.build b/src/test/xid-64/meson.build new file mode 100644 index 0000000000..f55ebdf41c --- /dev/null +++ b/src/test/xid-64/meson.build @@ -0,0 +1,15 @@ +tests += { + 'name': 'xid-64', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'tests': [ + 't/001_test_large_xids.pl', + 't/002_test_gucs.pl', + 't/003_test_integrity.pl', + 't/004_test_relminmxid.pl', + 't/005_stream_subxact.pl', + 't/006_zeropage.pl', + ], + }, +} diff --git a/src/test/xid-64/t/001_test_large_xids.pl b/src/test/xid-64/t/001_test_large_xids.pl new file mode 100644 index 0000000000..4c7dbc6cb1 --- /dev/null +++ b/src/test/xid-64/t/001_test_large_xids.pl @@ -0,0 +1,54 @@ +# Tests for large xid values +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use bigint; + +sub command_output +{ + my ($cmd) = @_; + my ($stdout, $stderr); + print("# Running: " . join(" ", @{$cmd}) . "\n"); + my $result = IPC::Run::run $cmd, '>', \$stdout, '2>', \$stderr; + ok($result, "@$cmd exit code 0"); + is($stderr, '', "@$cmd no stderr"); + return $stdout; +} + +my $START_VAL = 2**32; +my $MAX_VAL = 2**62; + +my $ixid = $START_VAL + int(rand($MAX_VAL - $START_VAL)); +my $imxid = $START_VAL + int(rand($MAX_VAL - $START_VAL)); +my $imoff = $START_VAL + int(rand($MAX_VAL - $START_VAL)); + +# Initialize master node with the random xid-related parameters +my $node = PostgreSQL::Test::Cluster->new('master'); +$node->init(extra => [ "--xid=$ixid", "--multixact-id=$imxid", "--multixact-offset=$imoff" ]); +$node->start; + +# Initialize master node and check the xid-related parameters +my $pgcd_output = command_output( + [ 'pg_controldata', '-D', $node->data_dir ] ); +print($pgcd_output); print('\n'); +ok($pgcd_output =~ qr/Latest checkpoint's NextXID:\s*(\d+)/, "XID found"); +my ($nextxid) = ($1); +ok($nextxid >= $ixid && $nextxid < $ixid + 1000, + "Latest checkpoint's NextXID ($nextxid) is close to the 
initial xid ($ixid)."); +ok($pgcd_output =~ qr/Latest checkpoint's NextMultiXactId:\s*(\d+)/, "MultiXactId found"); +my ($nextmxid) = ($1); +ok($nextmxid >= $imxid && $nextmxid < $imxid + 1000, + "Latest checkpoint's NextMultiXactId ($nextmxid) is close to the initial multiXactId ($imxid)."); +ok($pgcd_output =~ qr/Latest checkpoint's NextMultiOffset:\s*(\d+)/, "MultiOffset found"); +my ($nextmoff) = ($1); +ok($nextmoff >= $imoff && $nextmoff < $imoff + 1000, + "Latest checkpoint's NextMultiOffset ($nextmoff) is close to the initial multiOffset ($imoff)."); + +# Run pgbench to check whether the database is working properly +$node->command_ok( + [ qw(pgbench --initialize --no-vacuum --scale=10) ], + 'pgbench finished without errors'); + +done_testing(); \ No newline at end of file diff --git a/src/test/xid-64/t/002_test_gucs.pl b/src/test/xid-64/t/002_test_gucs.pl new file mode 100644 index 0000000000..ff9f2f3052 --- /dev/null +++ b/src/test/xid-64/t/002_test_gucs.pl @@ -0,0 +1,79 @@ +# Tests for guc boundary values +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use bigint; + +sub command_output +{ + my ($cmd) = @_; + my ($stdout, $stderr); + print("# Running: " . join(" ", @{$cmd}) . 
"\n"); + my $result = IPC::Run::run $cmd, '>', \$stdout, '2>', \$stderr; + ok($result, "@$cmd exit code 0"); + is($stderr, '', "@$cmd no stderr"); + return $stdout; +} + +sub set_guc +{ + my ($node, $guc, $val) = @_; + print("SET $guc = $val\n"); + $node->safe_psql('postgres', "ALTER SYSTEM SET $guc = $val"); + $node->restart(); +} + +sub test_pgbench +{ + my ($node) = @_; + $node->command_ok( + [ qw(pgbench --progress=5 --transactions=1000 --jobs=5 --client=5) ], + 'pgbench finished without errors'); +} + +my @guc_vals = ( + [ "autovacuum_freeze_max_age", 100000, 2**63 - 1 ], + [ "autovacuum_multixact_freeze_max_age", 10000, 2**63 - 1 ], + [ "vacuum_freeze_min_age", 0, 2**63 - 1 ], + [ "vacuum_freeze_table_age", 0, 2**63 - 1 ], + [ "vacuum_multixact_freeze_min_age", 0, 2**63 - 1 ], + [ "vacuum_multixact_freeze_table_age", 0, 2**63 -1 ] +); + +my $START_VAL = 2**32; +my $MAX_VAL = 2**62; + +my $ixid = $START_VAL + int(rand($MAX_VAL - $START_VAL)); +my $imxid = $START_VAL + int(rand($MAX_VAL - $START_VAL)); +my $imoff = $START_VAL + int(rand($MAX_VAL - $START_VAL)); + +# Initialize master node +my $node = PostgreSQL::Test::Cluster->new('master'); +$node->init(extra => [ "--xid=$ixid", "--multixact-id=$imxid", "--multixact-offset=$imoff" ]); +# Disable logging of all statements to avoid log bloat during pgbench +$node->append_conf('postgresql.conf', "log_statement = none"); +$node->start; + +# Fill the test database with the pgbench data +$node->command_ok( + [ qw(pgbench --initialize --scale=10) ], + 'pgbench finished without errors'); + +# Test all GUCs with minimum, maximum and random value inbetween +# (run pgbench for every configuration setting) +foreach my $gi (0 .. 
$#guc_vals) { + print($guc_vals[$gi][0]); print("\n"); + my $guc = $guc_vals[$gi][0]; + my $minval = $guc_vals[$gi][1]; + my $maxval = $guc_vals[$gi][2]; + set_guc($node, $guc, $minval); + test_pgbench($node); + set_guc($node, $guc, $maxval); + test_pgbench($node); + set_guc($node, $guc, $minval + int(rand($maxval - $minval))); + test_pgbench($node); +} + +done_testing(); \ No newline at end of file diff --git a/src/test/xid-64/t/003_test_integrity.pl b/src/test/xid-64/t/003_test_integrity.pl new file mode 100644 index 0000000000..ca079f11cb --- /dev/null +++ b/src/test/xid-64/t/003_test_integrity.pl @@ -0,0 +1,58 @@ +# Check integrity after dump/restore with different xids +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use File::Compare; + +my $tempdir = PostgreSQL::Test::Utils::tempdir; +use bigint; + +my $START_VAL = 2**32; +my $MAX_VAL = 2**62; + +my $ixid = $START_VAL + int(rand($MAX_VAL - $START_VAL)); +my $imxid = $START_VAL + int(rand($MAX_VAL - $START_VAL)); +my $imoff = $START_VAL + int(rand($MAX_VAL - $START_VAL)); + +# Initialize master node +my $node = PostgreSQL::Test::Cluster->new('master'); +$node->init(); +$node->start; + +# Create a database and fill it with the pgbench data +$node->safe_psql('postgres', "CREATE DATABASE pgbench_db"); +$node->command_ok( + [ qw(pgbench --initialize --scale=2 pgbench_db) ], + 'pgbench finished without errors'); +# Dump the database (cluster the main table to put data in a determined order) +$node->safe_psql('pgbench_db', qq( + CREATE INDEX pa_aid_idx ON pgbench_accounts (aid); + CLUSTER pgbench_accounts USING pa_aid_idx)); +$node->command_ok( + [ "pg_dump", "-w", "--inserts", "--file=$tempdir/pgbench.sql", "pgbench_db" ], + 'pgdump finished without errors'); +$node->stop('fast'); + +# Initialize second node +my $node2 = PostgreSQL::Test::Cluster->new('master2'); +$node2->init(extra => [ "--xid=$ixid", "--multixact-id=$imxid", "--multixact-offset=$imoff" 
]); +# Disable logging of all statements to avoid log bloat during restore +$node2->append_conf('postgresql.conf', "log_statement = none"); +$node2->start; + +# Create a database and restore the previous dump +$node2->safe_psql('postgres', "CREATE DATABASE pgbench_db"); +my $txid0 = $node2->safe_psql('pgbench_db', 'SELECT txid_current()'); +print("# Initial txid_current: $txid0\n"); +$node2->command_ok(["psql", "-q", "-f", "$tempdir/pgbench.sql", "pgbench_db"]); + +# Dump the database and compare the dumped content with the previous one +$node2->safe_psql('pgbench_db', 'CLUSTER pgbench_accounts'); +$node2->command_ok( + [ "pg_dump", "-w", "--inserts", "--file=$tempdir/pgbench2.sql", "pgbench_db" ], + 'pgdump finished without errors'); +ok(File::Compare::compare_text("$tempdir/pgbench.sql", "$tempdir/pgbench2.sql") == 0, "no differences detected"); + +done_testing(); \ No newline at end of file diff --git a/src/test/xid-64/t/004_test_relminmxid.pl b/src/test/xid-64/t/004_test_relminmxid.pl new file mode 100644 index 0000000000..e1f6e556e5 --- /dev/null +++ b/src/test/xid-64/t/004_test_relminmxid.pl @@ -0,0 +1,90 @@ +# Check integrity after dump/restore with different xids +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use bigint; + +my ($node, $rmm, $vacout); +$node = PostgreSQL::Test::Cluster->new('master'); +$node->init(extra => [ "--xid=3", "--multixact-id=1", "--multixact-offset=0" ]); +$node->append_conf('postgresql.conf', 'max_prepared_transactions = 2'); +$node->start; + +sub relminmxid +{ + my $rmm = $node->safe_psql("postgres", qq( + SELECT relminmxid + FROM pg_class + WHERE relname = 'foo';)); + return $rmm + 0; +} + +sub vacuum +{ + my ($rc, $stdout, $stderr) = $node->psql("postgres", "VACUUM foo;"); + return $stdout.$stderr; +} + +sub gen_multixact +{ + $node->safe_psql("postgres", qq( + BEGIN; + SELECT * FROM foo FOR KEY SHARE; + PREPARE TRANSACTION 'fooshare'; + )); + + my $xmax = 
$node->safe_psql("postgres", qq( + SELECT xmax FROM foo; + )); + isnt($xmax + 0, 0, "xmax not empty"); + + $node->safe_psql("postgres", qq( + BEGIN; + SELECT * FROM foo FOR KEY SHARE; + COMMIT; + COMMIT PREPARED 'fooshare'; + )); + + my $mxact = $node->safe_psql("postgres", qq( + SELECT xmax FROM foo; + )); + isnt($mxact + 0, 0, "mxact not empty"); + cmp_ok($xmax, '>', $mxact, "xmax is greater than mxact"); +} + +# Initialize master node with the random xid-related parameters +$node->safe_psql("postgres", "CREATE TABLE foo (a int); INSERT INTO foo VALUES (1);"); + +is(relminmxid(), 1, "relminmxid is default"); + +vacuum(); +is(relminmxid(), 1, "relminmxid is still default"); + +gen_multixact(); +is(relminmxid(), 1, "relminmxid is still still default"); + +unlike(vacuum(), qr/multixact.*before relminmxid/, "no relminmxid error"); + +# Now intentionally break relminmxid +$node->safe_psql("postgres", qq( + UPDATE pg_class SET relminmxid = ((1::int8<<62) + 1)::text::xid + WHERE relname = 'foo' +)); +cmp_ok(relminmxid(), '>', 2**62, "relminmxid broken (intentionally)"); + +gen_multixact(); +like(vacuum(), qr/multixact.*before relminmxid/, "got relminmxid error"); +cmp_ok(relminmxid(), '>', 2**62, "relminmxid broken (still)"); + +# Fix relminmxid by setting to default +$node->safe_psql("postgres", qq( + UPDATE pg_class SET relminmxid = '1' + WHERE relname = 'foo' +)); +is(relminmxid(), 1, "relminmxid is default again"); + +unlike(vacuum(), qr/multixact.*before relminmxid/, "no relminmxid error again"); + +done_testing(); diff --git a/src/test/xid-64/t/005_stream_subxact.pl b/src/test/xid-64/t/005_stream_subxact.pl new file mode 100644 index 0000000000..1379af6816 --- /dev/null +++ b/src/test/xid-64/t/005_stream_subxact.pl @@ -0,0 +1,100 @@ + +# Copyright (c) 2021, PostgreSQL Global Development Group + +# Test xids streaming of large transaction containing large subtransactions +# near 32-bit boundary. 
+# +# Mostly it is a copy of 016_stream_subxact.pl, but with publisher xid inited +# just before 32-bit boundary, so if xids are replicated as 32-bit values, +# subscriber will get 0 xid value. +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Create publisher node +my $node_publisher = PostgreSQL::Test::Cluster->new('publisher'); +$node_publisher->init(allows_streaming => 'logical', extra => ['-x', '4294966545']); +$node_publisher->append_conf('postgresql.conf', + 'logical_decoding_work_mem = 64kB'); +$node_publisher->start; + +# Create subscriber node +my $node_subscriber = PostgreSQL::Test::Cluster->new('subscriber'); +$node_subscriber->init(allows_streaming => 'logical'); +$node_subscriber->start; + +# Create some preexisting content on publisher +$node_publisher->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b varchar)"); +$node_publisher->safe_psql('postgres', + "INSERT INTO test_tab VALUES (1, 'foo'), (2, 'bar')"); + +# Setup structure on subscriber +$node_subscriber->safe_psql('postgres', + "CREATE TABLE test_tab (a int primary key, b text, c timestamptz DEFAULT now(), d bigint DEFAULT 999)" +); + +# Setup logical replication +my $publisher_connstr = $node_publisher->connstr . 
' dbname=postgres'; +$node_publisher->safe_psql('postgres', + "CREATE PUBLICATION tap_pub FOR TABLE test_tab"); + +my $appname = 'tap_sub'; +$node_subscriber->safe_psql('postgres', + "CREATE SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr application_name=$appname' PUBLICATION tap_pub WITH (streaming = on)" +); + +$node_publisher->wait_for_catchup($appname); + +# Also wait for initial table sync to finish +my $synced_query = + "SELECT count(1) = 0 FROM pg_subscription_rel WHERE srsubstate NOT IN ('r', 's');"; +$node_subscriber->poll_query_until('postgres', $synced_query) + or die "Timed out while waiting for subscriber to synchronize data"; + +my $result = + $node_subscriber->safe_psql('postgres', + "SELECT count(*), count(c), count(d = 999) FROM test_tab"); +is($result, qq(2|2|2), 'check initial data was copied to subscriber'); + +# Insert, update and delete enough rows to exceed 64kB limit. +$node_publisher->safe_psql( + 'postgres', q{ +BEGIN; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series( 3, 500) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s1; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(501, 1000) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s2; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(1001, 1500) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s3; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(1501, 2000) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +SAVEPOINT s4; +INSERT INTO test_tab SELECT i, md5(i::text) FROM generate_series(2001, 2500) s(i); +UPDATE test_tab SET b = md5(b) WHERE mod(a,2) = 0; +DELETE FROM test_tab WHERE mod(a,3) = 0; +COMMIT; +}); + +$node_publisher->wait_for_catchup($appname); + +$result = + 
$node_subscriber->safe_psql('postgres', + "SELECT count(*), count(c), count(d = 999) FROM test_tab"); +is($result, qq(1667|1667|1667), + 'check data was copied to subscriber in streaming mode and extra columns contain local defaults' +); + +$node_subscriber->stop; +$node_publisher->stop; + +done_testing(); diff --git a/src/test/xid-64/t/006_zeropage.pl b/src/test/xid-64/t/006_zeropage.pl new file mode 100644 index 0000000000..fd3ac3973f --- /dev/null +++ b/src/test/xid-64/t/006_zeropage.pl @@ -0,0 +1,33 @@ +use strict; +use warnings; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Check WAL for ZEROPAGE record. + +sub command_output +{ + my ($cmd) = @_; + my ($stdout, $stderr); + print("# Running: " . join(" ", @{$cmd}) . "\n"); + my $result = IPC::Run::run $cmd, '>', \$stdout, '2>', \$stderr; + return $stdout; +} + +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(extra => [ "--xid=3", "--multixact-id=3", "--multixact-offset=0" ]);; +$node->start; +my $pgdata = $node->data_dir; +my $xlogfilename0 = $node->safe_psql('postgres', + "SELECT pg_walfile_name(pg_current_wal_lsn())"); +#$node->command_like( +# [ 'pg_waldump', '-S', "$pgdata/pg_wal/$xlogfilename0" ], +# qr/ZEROPAGE/, +# 'pg_waldump prints start timestamp'); +my $wd_output = command_output( + [ 'pg_waldump', "$pgdata/pg_wal/$xlogfilename0" ]); +ok($wd_output =~ qr/ZEROPAGE page 0/, "ZEROPAGE found"); + +done_testing(); diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index c2acb58df0..74f2216ad1 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -399,6 +399,7 @@ sub GenerateFiles PACKAGE_TARNAME => lc qq{"$package_name"}, PACKAGE_URL => qq{"$package_url"}, PACKAGE_VERSION => qq{"$package_version"}, + XID_IS_64BIT => 1, PG_INT128_TYPE => undef, PG_INT64_TYPE => 'long long int', PG_KRB_SRVNAM => qq{"postgres"}, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 
97c9bc1861..4912da144f 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3360,8 +3360,8 @@ intset_internal_node intset_leaf_node intset_node intvKEY -itemIdCompact -itemIdCompactData +ItemIdCompact +ItemIdCompactData iterator jmp_buf join_search_hook_type -- 2.37.0 (Apple Git-136)