From 21bc5c12e374f89d64dfbdf085b4ec06ee997522 Mon Sep 17 00:00:00 2001
From: Craig Ringer <craig@2ndquadrant.com>
Date: Mon, 11 Aug 2014 14:20:03 +0800
Subject: [PATCH] bdr: Log all steps of DDL lock acquisition, always prefix
 with bdr:

The DDL lock acquision logging wasn't logging all the steps, logging
at consistent log levels, or prefixing things with bdr: for easy grepping.

Add more log detail, mention node IDs when messages are received or locks are
acquired.

Comment the individual steps in lock acquisition.
---
 bdr_locks.c | 185 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 149 insertions(+), 36 deletions(-)

diff --git a/bdr_locks.c b/bdr_locks.c
index 9858de91af..b6e774bbb5 100644
--- a/bdr_locks.c
+++ b/bdr_locks.c
@@ -21,26 +21,41 @@
  *    allows to inject transactional and nontransactional messages in the
  *    changestream.
  *
+ *    There are really two levels of DDL lock - the global lock that only
+ *    one node can hold, and individual local DDL locks on each node. If
+ *    a node holds the global DDL lock then it owns the local DDL locks on each
+ *    node.
+ *
  *    DDL lock acquiration basically works like this:
  *
- *    1) A utility command notices that it needs the ddl lock. If there
- *       already is a local ddl lock it'll ERROR out.
+ *    1) A utility command notices that it needs the global ddl lock and the local
+ *       node doesn't already hold it. If there already is a local ddl lock
+ *       it'll ERROR out, as this indicates another node already holds or is
+ *       trying to acquire the global DDL lock.
+ *
  *	  2) It sends out a 'acquire_lock' message to all other nodes.
+ *
  *    3) When another node receives a 'acquire_lock' message it checks whether
- *       the ddl lock is already held locally. If so it'll send a
- *       'decline_lock' message back causing the lock acquiration to fail.
- *    4) If a 'acquire_lock' message is received and the DDL lock is not held
- *    	 locally it'll be acquired and an entry into the 'bdr_global_locks'
- *    	 table will be made marking the lock to be in the 'catchup' phase.
+ *       the local ddl lock is already held. If so it'll send a 'decline_lock'
+ *       message back causing the lock acquiration to fail.
+ *
+ *    4) If a 'acquire_lock' message is received and the local DDL lock is not
+ *       held it'll be acquired and an entry into the 'bdr_global_locks' table
+ *       will be made marking the lock to be in the 'catchup' phase.
+ *
  *    5) All concurrent user transactions will be cancelled.
+ *
  *	  6) A 'request_replay_confirm' message will be sent to all other nodes
  *	     containing a lsn that has to be replayed.
+ *
  *    7) When a 'request_replay_confirm' message is received, a
  *       'replay_confirm' message will be sent back.
+ *
  *    8) Once all other nodes have replied with 'replay_confirm' the DDL lock
  *       has been successfully acquired on the node reading the 'acquire_lock'
  *       message (from 3)). The corresponding bdr_global_locks entry will be
  *       updated to the 'acquired' state and a 'confirm_lock' message will be sent out.
+ *
  *    9) Once all nodes have replied with 'confirm_lock' messages the ddl lock
  *    	 has been acquired.
  *
@@ -249,7 +264,10 @@ bdr_locks_find_my_database(bool create)
 }
 
 /*
+ * This node has just started up. Init its local state and send a startup
+ * announcement message.
  *
+ * Called from the per-db worker.
  */
 void
 bdr_locks_startup(Size nnodes)
@@ -285,7 +303,7 @@ bdr_locks_startup(Size nnodes)
 	 */
 	bdr_prepare_message(&s, BDR_MESSAGE_START);
 
-	elog(DEBUG1, "sending DDL lock startup message");
+	elog(DEBUG1, "bdr: sending DDL lock startup message");
 	lsn = LogStandbyMessage(s.data, s.len, false);
 	resetStringInfo(&s);
 	XLogFlush(lsn);
@@ -322,7 +340,8 @@ bdr_locks_startup(Size nnodes)
 				bdr_locks_find_database(DatumGetObjectId(values[7]), false);
 			db->lock_holder = node_id;
 			db->lockcount++;
-			elog(DEBUG1, "reacquiring DDL lock held before shutdown");
+			/* A remote node might have held the local DDL lock before restart */
+			elog(DEBUG1, "bdr: reacquiring local DDL lock held before shutdown");
 		}
 		else if (strcmp(state, "catchup") == 0)
 		{
@@ -346,10 +365,10 @@ bdr_locks_startup(Size nnodes)
 			db->replay_confirmed = 0;
 			db->replay_confirmed_lsn = wait_for_lsn;
 
-			elog(DEBUG1, "restarting DDL lock replay catchup phase");
+			elog(DEBUG1, "bdr: restarting DDL lock replay catchup phase");
 		}
 		else
-			elog(PANIC, "BDR: unknown lockstate");
+			elog(PANIC, "bdr: unknown lockstate");
 	}
 
 	systable_endscan(scan);
@@ -358,6 +377,8 @@ bdr_locks_startup(Size nnodes)
 
 	CommitTransactionCommand();
 
+	elog(DEBUG2, "bdr: ddl locking startup completed, local DML enabled");
+
 	/* allow local DML */
 	bdr_my_locks_database->locked_and_loaded = true;
 }
@@ -406,7 +427,7 @@ bdr_lock_xact_callback(XactEvent event, void *arg)
 		if (bdr_my_locks_database->lockcount > 0)
 			bdr_my_locks_database->lockcount--;
 		else
-			elog(WARNING, "BDR: releasing unacquired DDL lock");
+			elog(WARNING, "bdr: releasing unacquired DDL lock");
 		LWLockRelease(bdr_locks_ctl->lock);
 		this_xact_acquired_lock = false;
 	}
@@ -455,6 +476,9 @@ locks_begin_scan(Relation rel, Snapshot snap, uint64 sysid, TimeLineID tli, Oid
 
 /*
  * Acquire DDL lock on the side that wants to perform DDL.
+ *
+ * Called from a user backend when the command filter spots a DDL attempt; runs
+ * in the user backend.
  */
 void
 bdr_acquire_ddl_lock(void)
@@ -471,6 +495,8 @@ bdr_acquire_ddl_lock(void)
 
 	bdr_locks_find_my_database(false);
 
+	elog(DEBUG2, "bdr: attempting to acquire global DDL lock for (" BDR_LOCALID_FORMAT ")", BDR_LOCALID_FORMAT_ARGS);
+
 	/* send message about ddl lock */
 	bdr_prepare_message(&s, BDR_MESSAGE_ACQUIRE_LOCK);
 
@@ -493,12 +519,13 @@ bdr_acquire_ddl_lock(void)
 				 errmsg("database is locked against ddl by another node"),
 				 errhint("Some node in the cluster is performing DDL")));
 
+
 	bdr_my_locks_database->acquire_confirmed = 0;
 	bdr_my_locks_database->acquire_declined = 0;
 	bdr_my_locks_database->waiting_latch = &MyProc->procLatch;
 	LWLockRelease(bdr_locks_ctl->lock);
 
-	elog(DEBUG1, "waiting for the other nodes acks");
+	elog(DEBUG2, "bdr: sent DDL lock request, waiting for confirmation");
 
 	while (true)
 	{
@@ -538,12 +565,16 @@ bdr_acquire_ddl_lock(void)
 	}
 
 	LWLockAcquire(bdr_locks_ctl->lock, LW_EXCLUSIVE);
+
 	/* TODO: recheck it's ours */
 	bdr_my_locks_database->acquire_confirmed = 0;
 	bdr_my_locks_database->acquire_declined = 0;
 	bdr_my_locks_database->waiting_latch = NULL;
 	bdr_my_locks_database->lockcount++;
 	this_xact_acquired_lock = true;
+
+	elog(DEBUG1, "bdr: global DDL lock acquired successfully by (" BDR_LOCALID_FORMAT ")", BDR_LOCALID_FORMAT_ARGS);
+
 	LWLockRelease(bdr_locks_ctl->lock);
 
 }
@@ -581,6 +612,8 @@ check_is_my_node(uint64 sysid, TimeLineID tli, Oid datid)
 
 /*
  * Another node has asked for a DDL lock. Try to acquire the local ddl lock.
+ *
+ * Runs in the apply worker.
  */
 void
 bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
@@ -596,6 +629,8 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 
 	bdr_locks_find_my_database(false);
 
+	elog(DEBUG2, "bdr: node (" BDR_LOCALID_FORMAT ") requested the global DDL lock", sysid, tli, datid, "");
+
 	initStringInfo(&s);
 
 	LWLockAcquire(bdr_locks_ctl->lock, LW_EXCLUSIVE);
@@ -613,6 +648,8 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 		 * No previous DDL lock found. Start acquiring it.
 		 */
 
+		elog(DEBUG2, "bdr: no prior DDL lock found, acquiring local DDL lock");
+
 		/* Add a row to bdr_locks */
 		StartTransactionCommand();
 
@@ -648,12 +685,13 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 			ForceSyncCommit(); /* async commit would be too complicated */
 			heap_close(rel, NoLock);
 			CommitTransactionCommand();
+			elog(DEBUG3, "bdr: no conflicting global DDL lock found");
 		}
 		PG_CATCH();
 		{
 			if (geterrcode() == ERRCODE_UNIQUE_VIOLATION)
 			{
-				elog(DEBUG1, "declining DDL lock because a conflicting DDL lock exists in bdr_global_locks");
+				elog(DEBUG1, "bdr: declining DDL lock because a conflicting DDL lock exists in bdr_global_locks");
 				AbortOutOfAnyTransaction();
 				goto decline;
 			}
@@ -668,7 +706,7 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 		LWLockRelease(bdr_locks_ctl->lock);
 
 		/*
-		 * Now kill all local processes that are still writing. We won't
+		 * Now kill all local processes that are still writing. We can't just
 		 * prevent them from writing via the acquired lock as they are still
 		 * running. We're using drastic measures here because it'd be a bad
 		 * idea to wait: The primary is waiting for us and during that time
@@ -678,6 +716,7 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 		 * held conflicting locks, but that's not easiliy possible at the
 		 * moment.
 		 */
+		elog(DEBUG3, "bdr: terminating any local processes that conflict with the DDL lock");
 		conflicts = GetConflictingVirtualXIDs(InvalidTransactionId, MyDatabaseId);
 		while (conflicts->backendId != InvalidBackendId)
 		{
@@ -701,6 +740,8 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 				conflicts++;
 			else
 				pg_usleep(5000);
+
+			elog(DEBUG3, "bdr: signaled %d to terminate, conflicts with ddl lock requested by another node", p);
 		}
 
 		/*
@@ -709,8 +750,12 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 		 * by all other nodes. When the required number of messages is back we
 		 * can confirm the lock to the original requestor
 		 * (c.f. bdr_process_replay_confirm()).
+		 *
+		 * If we didn't wait for everyone to replay local changes then a DDL
+		 * change that caused those local changes not to apply on remote
+		 * nodes might occur, causing a divergent conflict.
 		 */
-		elog(DEBUG1, "requesting replay confirmation from all other nodes");
+		elog(DEBUG2, "bdr: ddl locking requesting replay confirmation from all other nodes before confirming lock granted");
 
 		wait_for_lsn = GetXLogInsertRecPtr();
 		bdr_prepare_message(&s, BDR_MESSAGE_REQUEST_REPLAY_CONFIRM);
@@ -723,6 +768,8 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 		bdr_my_locks_database->replay_confirmed = 0;
 		bdr_my_locks_database->replay_confirmed_lsn = wait_for_lsn;
 		LWLockRelease(bdr_locks_ctl->lock);
+
+		elog(DEBUG1, "bdr: ddl lock granted to remote node");
 	}
 	else
 	{
@@ -732,7 +779,7 @@ bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid)
 		LWLockRelease(bdr_locks_ctl->lock);
 decline:
 		ereport(LOG,
-				(errmsg("declining remote DDL lock request, already locked")));
+				(errmsg("bdr: declining remote DDL lock request, already locked")));
 		bdr_prepare_message(&s, BDR_MESSAGE_DECLINE_LOCK);
 
 		Assert(!IsTransactionState());
@@ -753,6 +800,11 @@ decline:
 	}
 }
 
+/*
+ * Another node has released the global DDL lock, update our local state.
+ *
+ * Runs in the apply worker.
+ */
 void
 bdr_process_release_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid origin_datid,
 							 uint64 lock_sysid, TimeLineID lock_tli, Oid lock_datid)
@@ -774,6 +826,8 @@ bdr_process_release_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
 
 	initStringInfo(&s);
 
+	elog(DEBUG2, "bdr: node (" BDR_LOCALID_FORMAT ") released DDL lock", lock_sysid, lock_tli, lock_datid, "");
+
 	/*
 	 * Remove row from bdr_locks *before* releasing the in memory lock. If we
 	 * crash we'll replay the event again.
@@ -786,7 +840,7 @@ bdr_process_release_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
 
 	while ((tuple = systable_getnext(scan)) != NULL)
 	{
-		elog(DEBUG1, "found ddl lock entry to delete in response to ddl lock release message");
+		elog(DEBUG3, "bdr: found DDL lock entry to delete in response to DDL lock release message");
 		simple_heap_delete(rel, &tuple->t_self);
 		ForceSyncCommit(); /* async commit would be too complicated */
 		found = true;
@@ -798,7 +852,11 @@ bdr_process_release_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
 	CommitTransactionCommand();
 
 	if (!found)
-		elog(WARNING, "did not find local DDL lock entry about a remotely released lock");
+		ereport(WARNING,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("bdr: Did not find local DDL lock entry for a remotely released global DDL lock"),
+				 errdetail("node ("BDR_LOCALID_FORMAT") sent a release message but the lock isn't held locally",
+				 		   lock_sysid, lock_tli, lock_datid, "")));
 
 	LWLockAcquire(bdr_locks_ctl->lock, LW_EXCLUSIVE);
 	if (bdr_my_locks_database->lockcount > 0)
@@ -807,19 +865,25 @@ bdr_process_release_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
 		bdr_my_locks_database->lock_holder = InvalidRepNodeId;
 	}
 	else
-		elog(WARNING, "releasing DDL lock without corresponding in-memory state");
+		/* This shouldn't happen; probable bug if reached */
+		elog(WARNING, "bdr: releasing DDL lock without corresponding in-memory state");
 
 	latch = bdr_my_locks_database->waiting_latch;
 	LWLockRelease(bdr_locks_ctl->lock);
 
+	elog(DEBUG1, "bdr: local DDL lock released");
+
 	/* notify an eventual waiter */
 	if(latch)
 		SetLatch(latch);
 }
 
 /*
- * Another node has confirmed a lock. Changed shared memory state and wakeup
- * the locker.
+ * Another node has confirmed that a node has acquired the DDL lock
+ * successfully. If the acquiring node was us, change shared memory state and
+ * wake up the user backend that was trying to acquire the lock.
+ *
+ * Runs in the apply worker.
  */
 void
 bdr_process_confirm_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid origin_datid,
@@ -840,8 +904,9 @@ bdr_process_confirm_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
 	bdr_my_locks_database->acquire_confirmed++;
 	latch = bdr_my_locks_database->waiting_latch;
 
-	elog(DEBUG1, "received ddl lock confirmation number %d/%zu",
-		 bdr_my_locks_database->acquire_confirmed, bdr_my_locks_database->nnodes);
+	elog(DEBUG2, "bdr: received DDL lock confirmation number %d/%zu from ("BDR_LOCALID_FORMAT")",
+		 bdr_my_locks_database->acquire_confirmed, bdr_my_locks_database->nnodes,
+		 origin_sysid, origin_tli, origin_datid, "");
 	LWLockRelease(bdr_locks_ctl->lock);
 
 	if(latch)
@@ -849,8 +914,10 @@ bdr_process_confirm_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
 }
 
 /*
- * Another node has declined a lock. Changed shared memory state and wakeup
- * the locker.
+ * Another node has declined a lock. If it was us, change shared memory state
+ * and wakeup the user backend that tried to acquire the lock.
+ *
+ * Runs in the apply worker.
  */
 void
 bdr_process_decline_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid origin_datid,
@@ -870,8 +937,17 @@ bdr_process_decline_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
 	LWLockRelease(bdr_locks_ctl->lock);
 	if(latch)
 		SetLatch(latch);
+
+	elog(DEBUG2, "bdr: node ("BDR_LOCALID_FORMAT") declined our DDL global lock request",
+		 origin_sysid, origin_tli, origin_datid, "");
 }
 
+/*
+ * Another node has asked us to confirm that we've replayed up to a given LSN.
+ * We've seen the request message, so send the requested confirmation.
+ *
+ * Runs in the walsender.
+ */
 void
 bdr_process_request_replay_confirm(uint64 sysid, TimeLineID tli,
 								   Oid datid, XLogRecPtr request_lsn)
@@ -884,13 +960,27 @@ bdr_process_request_replay_confirm(uint64 sysid, TimeLineID tli,
 
 	bdr_locks_find_my_database(false);
 
+	elog(DEBUG2, "bdr: node ("BDR_LOCALID_FORMAT") requested replay confirmation; sending",
+		 sysid, tli, datid, "");
+
 	initStringInfo(&s);
 	bdr_prepare_message(&s, BDR_MESSAGE_REPLAY_CONFIRM);
 	pq_sendint64(&s, request_lsn);
 	lsn = LogStandbyMessage(s.data, s.len, false);
 	XLogFlush(lsn);
+
 }
 
+/*
+ * A remote node has seen a replay confirmation request and replied to it.
+ *
+ * If we sent the original request, update local state appropriately.
+ *
+ * If a DDL lock request has reached quorum as a result of this confirmation,
+ * write a log acquisition confirmation and bdr_global_locks update to xlog.
+ *
+ * Runs in the apply worker.
+ */
 void
 bdr_process_replay_confirm(uint64 sysid, TimeLineID tli,
 						   Oid datid, XLogRecPtr request_lsn)
@@ -903,7 +993,8 @@ bdr_process_replay_confirm(uint64 sysid, TimeLineID tli,
 	bdr_locks_find_my_database(false);
 
 	LWLockAcquire(bdr_locks_ctl->lock, LW_EXCLUSIVE);
-	elog(DEBUG1, "processing replay confirmation for request %X/%X at %X/%X",
+	elog(DEBUG2, "bdr: processing replay confirmation from node ("BDR_LOCALID_FORMAT") for request %X/%X at %X/%X",
+		 sysid, tli, datid, "",
 		 (uint32)(bdr_my_locks_database->replay_confirmed_lsn >> 32),
 		 (uint32)bdr_my_locks_database->replay_confirmed_lsn,
 		 (uint32)(request_lsn >> 32),
@@ -914,7 +1005,7 @@ bdr_process_replay_confirm(uint64 sysid, TimeLineID tli,
 	{
 		bdr_my_locks_database->replay_confirmed++;
 
-		elog(DEBUG1, "confirming replay %u/%zu",
+		elog(DEBUG2, "bdr: confirming replay %u/%zu",
 			 bdr_my_locks_database->replay_confirmed,
 			 bdr_my_locks_database->nnodes);
 
@@ -938,6 +1029,8 @@ bdr_process_replay_confirm(uint64 sysid, TimeLineID tli,
 
 		initStringInfo(&s);
 
+		elog(DEBUG2, "bdr: DDL lock quorum reached, logging confirmation of this node's acquisition of global DDL lock");
+
 		/* clear out information about requested confirmations */
 		LWLockAcquire(bdr_locks_ctl->lock, LW_EXCLUSIVE);
 		bdr_my_locks_database->replay_confirmed = 0;
@@ -977,9 +1070,9 @@ bdr_process_replay_confirm(uint64 sysid, TimeLineID tli,
 			bool		isnull[10];
 
 			if (found)
-				elog(PANIC, "duplicate lock?");
+				elog(PANIC, "bdr: duplicate lock?");
 
-			elog(DEBUG1, "updating DDL lock state from 'catchup' to 'acquired'");
+			elog(DEBUG1, "bdr: updating DDL lock state from 'catchup' to 'acquired'");
 
 			heap_deform_tuple(tuple, RelationGetDescr(rel),
 							  values, isnull);
@@ -994,7 +1087,7 @@ bdr_process_replay_confirm(uint64 sysid, TimeLineID tli,
 		}
 
 		if (!found)
-			elog(PANIC, "unknown lock");
+			elog(PANIC, "bdr: got confirmation for unknown lock");
 
 		systable_endscan(scan);
 		UnregisterSnapshot(snap);
@@ -1002,7 +1095,7 @@ bdr_process_replay_confirm(uint64 sysid, TimeLineID tli,
 
 		CommitTransactionCommand();
 
-		elog(DEBUG1, "sending confirmation for DDL lock replay confirmation request");
+		elog(DEBUG2, "bdr: sent confirmation of successful DDL lock acquisition");
 	}
 }
 
@@ -1013,6 +1106,12 @@ bdr_locks_always_allow_writes(bool always_allow)
 	bdr_always_allow_writes = always_allow;
 }
 
+/*
+ * A remote node has sent a startup message. Update any appropriate local state
+ * like any locally held DDL locks for it.
+ *
+ * Runs in the apply worker.
+ */
 void
 bdr_locks_process_remote_startup(uint64 sysid, TimeLineID tli, Oid datid)
 {
@@ -1026,6 +1125,9 @@ bdr_locks_process_remote_startup(uint64 sysid, TimeLineID tli, Oid datid)
 
 	initStringInfo(&s);
 
+	elog(DEBUG2, "bdr: Got startup message from node ("BDR_LOCALID_FORMAT"), clearing any locks it held",
+		 sysid, tli, datid);
+
 	StartTransactionCommand();
 	snap = RegisterSnapshot(GetLatestSnapshot());
 	rel = heap_open(BdrLocksRelid, RowExclusiveLock);
@@ -1034,13 +1136,13 @@ bdr_locks_process_remote_startup(uint64 sysid, TimeLineID tli, Oid datid)
 
 	while ((tuple = systable_getnext(scan)) != NULL)
 	{
-		elog(DEBUG1, "found remote lock to delete (after remote restart)");
+		elog(DEBUG2, "bdr: found remote lock to delete (after remote restart)");
 
 		simple_heap_delete(rel, &tuple->t_self);
 		/* FIXME: locks */
 		LWLockAcquire(bdr_locks_ctl->lock, LW_EXCLUSIVE);
 		if (bdr_my_locks_database->lockcount == 0)
-			elog(WARNING, "bdr_global_locks row exists without corresponding in memory state");
+			elog(WARNING, "bdr: bdr_global_locks row exists without corresponding in memory state");
 		else
 		{
 			bdr_my_locks_database->lockcount--;
@@ -1057,6 +1159,12 @@ bdr_locks_process_remote_startup(uint64 sysid, TimeLineID tli, Oid datid)
 	CommitTransactionCommand();
 }
 
+/*
+ * The BDR ExecutorStart_hook that detects DDL and handles DDL lock acquision
+ * requests.
+ *
+ * Runs in user backends.
+ */
 static void
 BdrExecutorStart(QueryDesc *queryDesc, int eflags)
 {
@@ -1079,9 +1187,14 @@ BdrExecutorStart(QueryDesc *queryDesc, int eflags)
 
 	/* is the database still starting up and hasn't loaded locks */
 	if (!bdr_my_locks_database->locked_and_loaded)
+		/*
+		 * TODO: sleep here instead of ERRORing, and let statement_timeout
+		 * ERROR if appropriate
+		 */
 		ereport(ERROR,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("database is not yet ready for writes"),
+				 errmsg("bdr: database is not yet ready for DDL operations"),
+				 errdetail("BDR DDL locking is still starting up"),
 				 errhint("Wait for a short time and retry.")));
 
 	/* Is this database locked against user initiated ddl? */
@@ -1089,7 +1202,7 @@ BdrExecutorStart(QueryDesc *queryDesc, int eflags)
 	if (bdr_my_locks_database->lockcount > 0 && !this_xact_acquired_lock)
 		ereport(ERROR,
 				(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
-				 errmsg("database is locked against writes"),
+				 errmsg("bdr: database is locked against DDL operations"),
 				 errhint("Some node in the cluster is performing DDL")));
 
 done:
-- 
2.39.5