>From 4b36f4c0e1efecbd50918a2c854bfe6f26105201 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 30 Jan 2015 09:46:15 +0100 Subject: [PATCH 2/4] Fix startup process crash and race during CREATE/DROP/ALTER DATABASE. When replaying database related WAL records the startup process acquires lock on the database object to prevent new connections to the affected database. That's necessary because we currently don't replicate object locks from the primary. Unfortunately that locking couldn't use the standby infrastructure for managing locks and instead just used a normal LockSharedObjectForSession(). While that works in the (common!) uncontended situation, it doesn't if the lock is currently already acquired. The standby process intentionally doesn't set up the infrastructure to wait for locks in the normal, blocking, way. The goal is to cancel other lock holder after max_standby_*_delay, which is only possible if the sleeping is controlled. If we have to wait anyway, we either crash with a assertion or a FATAL (promoted to PANIC) error. There's also another race, which was documented in comments. Namely that not using the standby lock infrastructure requires explicitly releasing the lock. That means there's a short phase where the catalog and physical state of the database are inconsistent. After the previous commit we can now use the standby infrastructure to manage the lock. That requires assigning a transactionid to the create/drop database transactions. To handle WAL generated by a older point release we thus need to make the locking conditional on the relevant records having a transactionid. On the primary WAL format is bumped instead. Additionally add previously missing locking for CREATE DATABASE that could lead to checksum failures due to hint bit writes. Now that we don't do acquire any locks in potentially blocking mode in the startup process anymore, add a assertion ensuring that that stays so in the primary. Backpatch all the way. Discussion: 20150120152819.GC24381@alap3.anarazel.de --- src/backend/commands/dbcommands.c | 71 +++++++++++++++++++++++++++++++-------- src/backend/storage/lmgr/lock.c | 2 ++ 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 5e66961..3845eb7 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1204,6 +1204,12 @@ movedb(const char *dbname, const char *tblspcname) } /* + * Force xid assignment, for the benefit of dbase_redo()'s internal + * locking. + */ + GetTopTransactionId(); + + /* * Use an ENSURE block to make sure we remove the debris if the copy fails * (eg, due to out-of-disk-space). This is not a 100% solution, because * of the possibility of failure during transaction commit, but it should @@ -1314,6 +1320,12 @@ movedb(const char *dbname, const char *tblspcname) StartTransactionCommand(); /* + * Force xid assignment, for the benefit of dbase_redo()'s internal + * locking. + */ + GetTopTransactionId(); + + /* * Remove files from the old tablespace */ if (!rmtree(src_dbpath, true)) @@ -2053,6 +2065,34 @@ dbase_redo(XLogReaderState *record) dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id); /* + * Can only do the locking if the server generating the record was new + * enough to know it has to assign a xid. + */ + if (InHotStandby && TransactionIdIsValid(XLogRecGetXid(record))) + { + LOCKTAG locktag; + + SET_LOCKTAG_OBJECT(locktag, + InvalidOid, + DatabaseRelationId, + xlrec->src_db_id, + 0); + + /* Lock source database, do avoid concurrent hint bit writes et al. */ + StandbyAcquireLock(XLogRecGetXid(record), &locktag, AccessExclusiveLock); + ResolveRecoveryConflictWithDatabase(xlrec->src_db_id); + + /* Lock target database, it'll be overwritten in a second */ + SET_LOCKTAG_OBJECT(locktag, + InvalidOid, + DatabaseRelationId, + xlrec->db_id, + 0); + StandbyAcquireLock(XLogRecGetXid(record), &locktag, AccessExclusiveLock); + ResolveRecoveryConflictWithDatabase(xlrec->src_db_id); + } + + /* * Our theory for replaying a CREATE is to forcibly drop the target * subdirectory if present, then re-copy the source data. This may be * more work than needed, but it is simple to implement. @@ -2086,16 +2126,31 @@ dbase_redo(XLogReaderState *record) dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id); - if (InHotStandby) + /* + * Can only do the locking if the server generating the record was new + * enough to know it has to assign a xid. + */ + if (InHotStandby && TransactionIdIsValid(XLogRecGetXid(record))) { + LOCKTAG locktag; + /* * Lock database while we resolve conflicts to ensure that * InitPostgres() cannot fully re-execute concurrently. This * avoids backends re-connecting automatically to same database, * which can happen in some cases. */ - LockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock); + SET_LOCKTAG_OBJECT(locktag, + InvalidOid, + DatabaseRelationId, + xlrec->db_id, + 0); + StandbyAcquireLock(XLogRecGetXid(record), &locktag, AccessExclusiveLock); ResolveRecoveryConflictWithDatabase(xlrec->db_id); + + /* Lock pg_database, to conflict with base backups */ + SET_LOCKTAG_RELATION(locktag, 0, DatabaseRelationId); + StandbyAcquireLock(XLogRecGetXid(record), &locktag, RowExclusiveLock); } /* Drop pages for this database that are in the shared buffer cache */ @@ -2112,18 +2167,6 @@ dbase_redo(XLogReaderState *record) ereport(WARNING, (errmsg("some useless files may be left behind in old database directory \"%s\"", dst_path))); - - if (InHotStandby) - { - /* - * Release locks prior to commit. XXX There is a race condition - * here that may allow backends to reconnect, but the window for - * this is small because the gap between here and commit is mostly - * fairly small and it is unlikely that people will be dropping - * databases that we are trying to connect to anyway. - */ - UnlockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock); - } } else elog(PANIC, "dbase_redo: unknown op code %u", info); diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 02ecf3d..f051ad6 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -718,6 +718,8 @@ LockAcquireExtended(const LOCKTAG *locktag, lockMethodTable->lockModeNames[lockmode]), errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery."))); + Assert(!InRecovery || (sessionLock && dontWait)); + #ifdef LOCK_DEBUG if (LOCK_DEBUG_ENABLED(locktag)) elog(LOG, "LockAcquire: lock [%u,%u] %s", -- 2.2.1.212.gc5b9256