bdr: Replace GUC-based connections with SQL and tables

author Craig Ringer <craig@2ndquadrant.com>

Sat, 7 Feb 2015 11:54:17 +0000 (00:54 +1300)

committer Andres Freund <andres@anarazel.de>

Thu, 12 Feb 2015 09:16:58 +0000 (10:16 +0100)
author Craig Ringer <craig@2ndquadrant.com>
Sat, 7 Feb 2015 11:54:17 +0000 (00:54 +1300)
committer Andres Freund <andres@anarazel.de>
Thu, 12 Feb 2015 09:16:58 +0000 (10:16 +0100)
diff --git a/Makefile.in b/Makefile.in

index e92e62f18ecd076948d5dd81898d03b9233c6e2d..318c063f1be03956fba13b3087a37030234346fa 100644 (file)
--- a/Makefile.in
+++ b/Makefile.in
@@ -20,7 +20,8 @@ DATA = \
     extsql/bdr--0.8.0.4--0.8.0.5.sql \
     extsql/bdr--0.8.0.5--0.8.0.6.sql \
     extsql/bdr--0.8.0.6--0.8.0.7.sql \
-   extsql/bdr--0.8.0.7--0.9.0.0.sql
+   extsql/bdr--0.8.0.7--0.9.0.0.sql \
+   extsql/bdr--0.9.0.0--0.9.0.1.sql
  
  DATA_built = \
     extsql/bdr--0.8.0.1.sql \
@@ -30,7 +31,8 @@ DATA_built = \
     extsql/bdr--0.8.0.5.sql \
     extsql/bdr--0.8.0.6.sql \
     extsql/bdr--0.8.0.7.sql \
-   extsql/bdr--0.9.0.0.sql
+   extsql/bdr--0.9.0.0.sql \
+   extsql/bdr--0.9.0.1.sql
  
  DOCS = bdr.conf.sample README.bdr
  SCRIPTS = scripts/bdr_initial_load bdr_init_copy bdr_resetxlog bdr_dump
@@ -47,6 +49,7 @@ OBJS = \
     bdr_conflict_handlers.o \
     bdr_conflict_logging.o \
     bdr_commandfilter.o \
+   bdr_common.o \
     bdr_compat.o \
     bdr_count.o \
     bdr_executor.o \
@@ -55,7 +58,9 @@ OBJS = \
     bdr_locks.o \
     bdr_output.o \
     bdr_relcache.o \
-   bdr_remotecalls.o
+   bdr_remotecalls.o \
+   bdr_supervisor.o \
+   bdr_upgrade.o
  
  ifeq "@BUILDING_BDR@" "1"
  OBJS += \
@@ -91,7 +96,6 @@ bdr_init_copy: bdr_init_copy.o
     $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(libpq_pgport) $(LIBS) -o $@$(X)
  
  scripts/bdr_initial_load: scripts/bdr_initial_load.in
-   mkdir -p scripts
     sed -e "s/BDR_VERSION/$(BDR_VERSION)/" -e "s/PG_VERSION/$(VERSION)/" $< > $@
  
  extsql/bdr--0.8.0.1.sql: extsql/bdr--0.8.0.sql extsql/bdr--0.8.0--0.8.0.1.sql
@@ -124,6 +128,11 @@ extsql/bdr--0.8.0.7.sql: extsql/bdr--0.8.0.6.sql extsql/bdr--0.8.0.6--0.8.0.7.sq
     cat $^ > $@
  
  extsql/bdr--0.9.0.0.sql: extsql/bdr--0.8.0.7.sql extsql/bdr--0.8.0.7--0.9.0.0.sql
+   mkdir -p extsql
+   cat $^ > $@
+
+extsql/bdr--0.9.0.1.sql: extsql/bdr--0.9.0.0.sql extsql/bdr--0.9.0.0--0.9.0.1.sql
+   mkdir -p extsql
     cat $^ > $@
  
  bdr_resetxlog: pg_resetxlog.o
@@ -175,13 +184,17 @@ check: regresscheck isolationcheck
  DDLREGRESSCHECKS=ddl/create ddl/alter_table ddl/extension ddl/function \
                  ddl/grant ddl/mixed ddl/namespace ddl/replication_set \
                  ddl/sequence ddl/view
+REGRESSINIT=init_bdr
  else
  check: regresscheck
  DDLREGRESSCHECKS=
+REGRESSINIT=init_udr
  endif
  
+
  REGRESSCHECKS= \
     init \
+   $(REGRESSINIT) \
     upgrade \
     identifier \
     $(DDLREGRESSCHECKS) \
@@ -189,7 +202,7 @@ REGRESSCHECKS= \
  
  
  ISOLATIONCHECKS=\
-   isolation/waitforstart \
+   isolation/init \
     isolation/ddlconflict \
     isolation/dmlconflict_ii \
     isolation/dmlconflict_uu \
diff --git a/bdr.c b/bdr.c

index 276fa1c705a98d969985dc9b7bcc63071596fc9a..8090d57760bbbd8b07e03e74e73a7cc3895f96e1 100644 (file)
--- a/bdr.c
+++ b/bdr.c
@@ -71,23 +71,18 @@ extern Oid          origin_dboid;
  /* end externs for bdr apply state */
  
  ResourceOwner bdr_saved_resowner;
-static bool bdr_is_restart = false;
  Oid   BdrNodesRelid;
  Oid   BdrConflictHistoryRelId;
  Oid   BdrLocksRelid;
  Oid   BdrLocksByOwnerRelid;
  Oid   BdrReplicationSetConfigRelid;
  
-BdrConnectionConfig  **bdr_connection_configs;
-/* All databases for which BDR is configured, valid after _PG_init */
-char **bdr_distinct_dbnames;
-uint32 bdr_distinct_dbnames_count = 0;
-
  /* GUC storage */
  static char *connections = NULL;
  static bool bdr_synchronous_commit;
  int bdr_default_apply_delay;
  int bdr_max_workers;
+int bdr_max_databases;
  static bool bdr_skip_ddl_replication;
  bool bdr_skip_ddl_locking;
  bool bdr_do_not_replicate;
@@ -101,11 +96,17 @@ BdrWorkerType bdr_worker_type = BDR_WORKER_EMPTY_SLOT;
  /* shortcut for finding the the worker shmem block */
  BdrWorkerControl *BdrWorkerCtl = NULL;
  
+/* This worker's block within BdrWorkerCtl - only valid in bdr workers */
+BdrWorker  *bdr_worker_slot = NULL;
+
+/* Worker generation number; see bdr_worker_shmem_startup comments */
+static uint16 bdr_worker_generation;
+
+
  PG_MODULE_MAGIC;
  
  void       _PG_init(void);
  static void bdr_worker_shmem_startup(void);
-static void bdr_worker_shmem_create_workers(void);
  
  PGDLLEXPORT Datum bdr_apply_pause(PG_FUNCTION_ARGS);
  PGDLLEXPORT Datum bdr_apply_resume(PG_FUNCTION_ARGS);
@@ -123,7 +124,7 @@ PG_FUNCTION_INFO_V1(bdr_min_remote_version_num);
  PG_FUNCTION_INFO_V1(bdr_variant);
  PG_FUNCTION_INFO_V1(bdr_get_local_nodeid);
  
-static void
+void
  bdr_sigterm(SIGNAL_ARGS)
  {
     int         save_errno = errno;
@@ -144,7 +145,7 @@ bdr_sigterm(SIGNAL_ARGS)
     errno = save_errno;
  }
  
-static void
+void
  bdr_sighup(SIGNAL_ARGS)
  {
     int         save_errno = errno;
@@ -212,7 +213,7 @@ bdr_get_remote_dboid(const char *conninfo_db)
   *
   * The replication identifier is allocated in the current memory context.
   */
-void
+static void
  bdr_build_ident_and_slotname(uint64 remote_sysid, TimeLineID remote_tlid,
         Oid remote_dboid, char **out_replication_identifier,
         Name out_slot_name)
@@ -438,7 +439,7 @@ bdr_worker_init(char *dbname)
     /* make sure BDR extension exists */
     bdr_executor_always_allow_writes(true);
     StartTransactionCommand();
-   bdr_maintain_schema();
+   bdr_maintain_schema(true);
     CommitTransactionCommand();
     bdr_executor_always_allow_writes(false);
  
@@ -570,215 +571,6 @@ bdr_establish_connection_and_slot(const char *dsn,
     return streamConn;
  }
  
-/*
- * In postmaster, at shared_preload_libaries time, create the GUCs for a
- * connection. They'll be accessed by the apply worker that uses these GUCs
- * later.
- *
- * Returns false if the config wasn't created for some reason (missing
- * required options, etc); true if it's ok. Out parameters are not changed if
- * false is returned.
- *
- * Params:
- *
- *  name
- *  Name of this conn - bdr.<name>
- *
- *  used_databases
- *  Array of char*, names of distinct databases named in configured conns
- *
- *  num_used_databases
- *  Number of distinct databases named in conns
- *
- * out_config
- *  Assigned a palloc'd pointer to GUC storage for this config'd connection
- *
- * out_config is set even if false is returned, as the GUCs have still been
- * created. Test out_config->is_valid to see whether the connection is usable.
- */
-static bool
-bdr_create_con_gucs(char  *name,
-                   char **used_databases,
-                   Size  *num_used_databases,
-                   char **database_initcons,
-                   BdrConnectionConfig **out_config)
-{
-   Size        off;
-   char       *errormsg = NULL;
-   PQconninfoOption *options;
-   PQconninfoOption *cur_option;
-   BdrConnectionConfig *opts;
-
-   /* don't free, referenced by the guc machinery! */
-   char       *optname_dsn = palloc(strlen(name) + 30);
-   char       *optname_delay = palloc(strlen(name) + 30);
-   char       *optname_replica = palloc(strlen(name) + 30);
-   char       *optname_local_dsn = palloc(strlen(name) + 30);
-   char       *optname_local_dbname = palloc(strlen(name) + 30);
-   char       *optname_replication_sets = palloc(strlen(name) + 30);
-
-   Assert(process_shared_preload_libraries_in_progress);
-
-   /* Ensure the connection name is legal */
-   if (strchr(name, '_') != NULL)
-   {
-       ereport(ERROR,
-               (errmsg("bdr.connections entry '%s' contains the '_' character, which is not permitted", name)));
-   }
-
-   /* allocate storage for connection parameters */
-   opts = palloc0(sizeof(BdrConnectionConfig));
-   opts->is_valid = false;
-   *out_config = opts;
-
-   opts->name = pstrdup(name);
-
-   /* Define GUCs for this connection */
-   sprintf(optname_dsn, "bdr.%s_dsn", name);
-   DefineCustomStringVariable(optname_dsn,
-                              optname_dsn,
-                              NULL,
-                              &opts->dsn,
-                              NULL, PGC_POSTMASTER,
-                              GUC_NOT_IN_SAMPLE,
-                              NULL, NULL, NULL);
-
-   sprintf(optname_delay, "bdr.%s_apply_delay", name);
-   DefineCustomIntVariable(optname_delay,
-                           optname_delay,
-                           NULL,
-                           &opts->apply_delay,
-                           -1, -1, INT_MAX,
-                           PGC_SIGHUP,
-                           GUC_UNIT_MS,
-                           NULL, NULL, NULL);
-
-   sprintf(optname_replica, "bdr.%s_init_replica", name);
-   DefineCustomBoolVariable(optname_replica,
-                            optname_replica,
-                            NULL,
-                            &opts->init_replica,
-                            false,
-                            PGC_SIGHUP,
-                            0,
-                            NULL, NULL, NULL);
-
-   sprintf(optname_local_dsn, "bdr.%s_replica_local_dsn", name);
-   DefineCustomStringVariable(optname_local_dsn,
-                              optname_local_dsn,
-                              NULL,
-                              &opts->replica_local_dsn,
-                              NULL, PGC_POSTMASTER,
-                              GUC_NOT_IN_SAMPLE,
-                              NULL, NULL, NULL);
-
-   sprintf(optname_local_dbname, "bdr.%s_local_dbname", name);
-   DefineCustomStringVariable(optname_local_dbname,
-                              optname_local_dbname,
-                              NULL,
-                              &opts->dbname,
-                              NULL, PGC_POSTMASTER,
-                              GUC_NOT_IN_SAMPLE,
-                              NULL, NULL, NULL);
-
-   sprintf(optname_replication_sets, "bdr.%s_replication_sets", name);
-   DefineCustomStringVariable(optname_replication_sets,
-                              optname_replication_sets,
-                              NULL,
-                              &opts->replication_sets,
-                              NULL, PGC_POSTMASTER,
-                              GUC_LIST_INPUT | GUC_LIST_QUOTE,
-                              NULL, NULL, NULL);
-
-
-   if (!opts->dsn)
-   {
-       elog(WARNING, "bdr %s: no connection information", name);
-       return false;
-   }
-
-   elog(DEBUG2, "bdr %s: dsn=%s", name, opts->dsn);
-
-   options = PQconninfoParse(opts->dsn, &errormsg);
-   if (errormsg != NULL)
-   {
-       char       *str = pstrdup(errormsg);
-
-       PQfreemem(errormsg);
-       ereport(ERROR,
-               (errcode(ERRCODE_CONFIG_FILE_ERROR),
-                errmsg("bdr %s: error in dsn: %s", name, str)));
-   }
-
-   if (opts->dbname == NULL)
-   {
-       cur_option = options;
-       while (cur_option->keyword != NULL)
-       {
-           if (strcmp(cur_option->keyword, "dbname") == 0)
-           {
-               if (cur_option->val == NULL)
-                   ereport(ERROR,
-                           (errcode(ERRCODE_CONFIG_FILE_ERROR),
-                            errmsg("bdr %s: no dbname set", name)));
-
-               opts->dbname = pstrdup(cur_option->val);
-               elog(DEBUG2, "bdr %s: dbname=%s", name, opts->dbname);
-           }
-
-           if (cur_option->val != NULL)
-           {
-               elog(DEBUG3, "bdr %s: opt %s, val: %s",
-                    name, cur_option->keyword, cur_option->val);
-           }
-           cur_option++;
-       }
-   }
-
-   /* cleanup */
-   PQconninfoFree(options);
-
-   /*
-    * If this is a DB name we haven't seen yet, add it to our set of known
-    * DBs.
-    */
-   for (off = 0; off < *num_used_databases; off++)
-   {
-       if (strcmp(opts->dbname, used_databases[off]) == 0)
-           break;
-   }
-
-   if (off == *num_used_databases)
-   {
-       /* Didn't find a match, add new db name */
-       used_databases[(*num_used_databases)++] =
-           pstrdup(opts->dbname);
-       elog(DEBUG2, "bdr %s: Saw new database %s, now %i known dbs",
-            name, opts->dbname, (int)(*num_used_databases));
-   }
-
-   /*
-    * Make sure that at most one of the worker configs for each DB can be
-    * configured to run initialization.
-    */
-   if (opts->init_replica)
-   {
-       elog(DEBUG2, "bdr %s: has init_replica=t", name);
-       if (database_initcons[off] != NULL)
-           ereport(ERROR,
-                   (errcode(ERRCODE_CONFIG_FILE_ERROR),
-                    errmsg("Connections %s and %s on database %s both have bdr_init_replica enabled, cannot continue",
-                           name, database_initcons[off], used_databases[off])));
-       else
-           database_initcons[off] = name; /* no need to pstrdup, see _PG_init */
-   }
-
-   opts->is_valid = true;
-
-   /* optname vars intentionally leaked, see above */
-   return true;
-}
-
  static size_t
  bdr_worker_shmem_size()
  {
@@ -846,16 +638,35 @@ bdr_worker_shmem_startup(void)
         /* Init shm segment header after postmaster start or restart */
         memset(BdrWorkerCtl, 0, bdr_worker_shmem_size());
         BdrWorkerCtl->lock = LWLockAssign();
+       /* Assigned on supervisor launch */
+       BdrWorkerCtl->supervisor_latch = NULL;
  
         /*
-        * Now that the shm segment is initialized, we can populate it with
-        * BdrWorker entries for the connections we created GUCs for during
-        * _PG_init.
+        * The postmaster keeps track of a generation number for BDR workers
+        * and increments it at each restart.
+        *
+        * Background workers aren't unregistered when the postmaster restarts
+        * and clears shared memory, so after a restart the supervisor and
+        * per-db workers have no idea what workers are/aren't running, nor any
+        * way to control them. To make a clean BDR restart possible the
+        * workers registered before the restart need to find out about the
+        * restart and terminate.
+        *
+        * To make that possible we pass the generation number to the worker
+        * in its main argument, and also set it in shared memory. The two
+        * must match. If they don't, the worker will proc_exit(0), causing
+        * itself to be unregistered.
          *
-        * We must do this whether it's initial launch or a postmaster restart,
-        * as shmem gets cleared on postmaster restart.
+        * This should really be part of the bgworker API itself, handled via
+        * a BGW_NO_RESTART_ON_CRASH flag or by providing a generation number
+        * as a bgworker argument. However, for now we're stuck with this
+        * workaround.
          */
-       bdr_worker_shmem_create_workers();
+       if (bdr_worker_generation == UINT16_MAX)
+           /* We could handle wrap-around, but really ... */
+           elog(FATAL, "Too many postmaster crash/restart cycles. Restart the PostgreSQL server.");
+
+       BdrWorkerCtl->worker_generation = ++bdr_worker_generation;
     }
     LWLockRelease(AddinShmemInitLock);
  
@@ -865,145 +676,6 @@ bdr_worker_shmem_startup(void)
      */
  }
  
-/*
- * After _PG_init we've read the GUCs for the workers but haven't populated the
- * shared memory segment at BdrWorkerCtl with BDRWorker entries yet.
- *
- * The shm segment is initialized now, so do that.
- */
-static void
-bdr_worker_shmem_create_workers(void)
-{
-   uint32 off;
-
-   /*
-    * Create a BdrPerdbWorker for each distinct database found during
-    * _PG_init. The bgworker for each has already been registered and assigned
-    * a slot position during _PG_init, but the slot doesn't have anything
-    * useful in it yet. Because it was already registered we don't need
-    * any protection against duplicate launches on restart here.
-    *
-    * Because these slots are pre-assigned before shmem is bought up they
-    * MUST be reserved first, before any shmem entries are allocated, so
-    * they get the first slots.
-    *
-    * When started, this worker will continue setup - doing any required
-    * initialization of the database, then registering dynamic bgworkers for
-    * the DB's individual BDR connections.
-    *
-    * If we ever want to support dynamically adding/removing DBs from BDR at
-    * runtime, this'll need to move into a static bgworker because dynamic
-    * bgworkers can't be launched directly from the postmaster. We'll need a
-    * "bdr manager" static bgworker.
-    */
-
-   for (off = 0; off < bdr_distinct_dbnames_count; off++)
-   {
-       BdrWorker      *shmworker;
-       BdrPerdbWorker *perdb;
-       uint32      ctl_idx;
-
-       shmworker = (BdrWorker *) bdr_worker_shmem_alloc(BDR_WORKER_PERDB, &ctl_idx);
-       Assert(shmworker->worker_type == BDR_WORKER_PERDB);
-       /*
-        * The workers have already been assigned shmem indexes during
-        * _PG_init, so they MUST get the same index here. So long as these
-        * entries are assigned before any other shmem slots they will.
-        */
-       Assert(ctl_idx == off);
-       perdb = &shmworker->data.perdb;
-
-       strncpy(NameStr(perdb->dbname), bdr_distinct_dbnames[off], NAMEDATALEN);
-       NameStr(perdb->dbname)[NAMEDATALEN-1] = '\0';
-
-       perdb->nnodes = 0;
-       perdb->seq_slot = off;
-
-       elog(DEBUG1, "Assigning shmem bdr database worker for db %s",
-            NameStr(perdb->dbname));
-   }
-
-   /*
-    * Populate shmem with a BdrApplyWorker for each valid BdrConnectionConfig
-    * found during _PG_init so that the per-db worker will register it for
-    * startup after performing any BDR initialisation work.
-    *
-    * Use of shared memory for this is required for EXEC_BACKEND (windows)
-    * where we can't share postmaster memory, and for when we're launching a
-    * bgworker from another bgworker where the fork() from postmaster doesn't
-    * provide access to the launching bgworker's memory.
-    *
-    * The workers aren't actually launched here, they get launched by
-    * launch_apply_workers(), called by the database's per-db static worker.
-    */
-   for (off = 0; off < bdr_max_workers; off++)
-   {
-       BdrConnectionConfig *cfg = bdr_connection_configs[off];
-       BdrWorker      *shmworker;
-       BdrApplyWorker *worker;
-       int             i;
-       bool            found_perdb = false;
-
-       if (cfg == NULL || !cfg->is_valid)
-           continue;
-
-       shmworker = (BdrWorker *) bdr_worker_shmem_alloc(BDR_WORKER_APPLY, NULL);
-       Assert(shmworker->worker_type == BDR_WORKER_APPLY);
-       worker = &shmworker->data.apply;
-       worker->connection_config_idx = off;
-       worker->replay_stop_lsn = InvalidXLogRecPtr;
-       worker->forward_changesets = false;
-
-       /*
-        * Now search for the perdb worker belonging to this slot.
-        */
-       for (i = 0; i < bdr_max_workers; i++)
-       {
-           BdrPerdbWorker *perdb;
-           BdrWorker *entry = &BdrWorkerCtl->slots[i];
-
-           if (entry->worker_type != BDR_WORKER_PERDB)
-               continue;
-
-           perdb = &entry->data.perdb;
-
-           if (strcmp(NameStr(perdb->dbname), cfg->dbname) != 0)
-               continue;
-
-           /*
-            * Remember how many connections there are for this node. This
-            * will, e.g., be used to determine the quorum for ddl locks and
-            * sequencer votes.
-            */
-           perdb->nnodes++;
-           found_perdb = true;
-           worker->perdb_worker_off = i;
-           break;
-       }
-
-       if (!found_perdb)
-           elog(ERROR, "couldn't find perdb entry for apply worker");
-
-       /*
-        * If this is a postmaster restart, don't register the worker a second
-        * time when the per-db worker starts up.
-        */
-       worker->bgw_is_registered = bdr_is_restart;
-   }
-
-   /*
-    * Make sure that we don't register workers if the postmaster restarts and
-    * clears shmem, by keeping a record that we've asked for registration once
-    * already.
-    */
-   bdr_is_restart = true;
-
-   /*
-    * We might need to re-populate shared memory after a postmaster restart.
-    * So we don't free the bdr_startup_context or its contents.
-    */
-}
-
  
  /*
   * Allocate a block from the bdr_worker shm segment in BdrWorkerCtl, or ERROR
@@ -1014,12 +686,16 @@ bdr_worker_shmem_create_workers(void)
   * ctl_idx, if passed, is set to the index of the worker within BdrWorkerCtl.
   *
   * To release a block, use bdr_worker_shmem_release(...)
+ *
+ * You must hold BdrWorkerCtl->lock in LW_EXCLUSIVE mode for
+ * this call.
   */
  BdrWorker*
  bdr_worker_shmem_alloc(BdrWorkerType worker_type, uint32 *ctl_idx)
  {
     int i;
-   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
     for (i = 0; i < bdr_max_workers; i++)
     {
         BdrWorker *new_entry = &BdrWorkerCtl->slots[i];
@@ -1027,13 +703,11 @@ bdr_worker_shmem_alloc(BdrWorkerType worker_type, uint32 *ctl_idx)
         {
             memset(new_entry, 0, sizeof(BdrWorker));
             new_entry->worker_type = worker_type;
-           LWLockRelease(BdrWorkerCtl->lock);
             if (ctl_idx)
                 *ctl_idx = i;
             return new_entry;
         }
     }
-   LWLockRelease(BdrWorkerCtl->lock);
     ereport(ERROR,
             (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
             errmsg("No free bdr worker slots - bdr.max_workers is too low")));
@@ -1127,17 +801,7 @@ bdr_do_not_replicate_assign_hook(bool newvalue, void *extra)
  void
  _PG_init(void)
  {
-   List       *connames;
-   ListCell   *c;
     MemoryContext old_context;
-   char       *connections_tmp;
-
-   char      **used_databases;
-   char      **database_initcons;
-   Size        num_used_databases = 0;
-   int         connection_config_idx;
-   BackgroundWorker bgw;
-   uint32      off;
  
     if (!process_shared_preload_libraries_in_progress)
         ereport(ERROR,
@@ -1151,6 +815,15 @@ _PG_init(void)
                  errmsg("bdr requires \"track_commit_timestamp\" to be enabled")));
  #endif
  
+   /*
+    * _PG_init only runs on first load, not on postmaster restart, so
+    * set the worker generation here. See bdr_worker_shmem_startup.
+    *
+    * It starts at 1 because the postmaster zeroes shmem on restart, so 0 can
+    * mean "just restarted, hasn't run shmem setup callback yet".
+    */
+   bdr_worker_generation = 1;
+
     /*
      * Force btree_gist to be loaded - its absolutely not required at this
      * point, but since it's required for BDR to be used it's much easier to
@@ -1213,10 +886,19 @@ _PG_init(void)
      * memory array.
      */
     DefineCustomIntVariable("bdr.max_workers",
-                           "max number of bdr connections + distinct databases. -1 auto-calculates.",
+                           "max number of bdr connections + distinct databases.",
                             NULL,
                             &bdr_max_workers,
-                           -1, -1, 100,
+                           20, 2, 100,
+                           PGC_POSTMASTER,
+                           0,
+                           NULL, NULL, NULL);
+
+   DefineCustomIntVariable("bdr.max_databases",
+                           "max number of distinct databases on which BDR may be active",
+                           NULL,
+                           &bdr_max_databases,
+                           -1, -1, 50,
                             PGC_POSTMASTER,
                             0,
                             NULL, NULL, NULL);
@@ -1271,15 +953,6 @@ _PG_init(void)
                                0,
                                NULL, NULL, NULL);
  
-   DefineCustomBoolVariable("bdr.init_from_basedump",
-                            "Internal. Set during local initialization from basebackup only",
-                            NULL,
-                            &bdr_init_from_basedump,
-                            false,
-                            PGC_BACKEND,
-                            0,
-                            NULL, NULL, NULL);
-
     DefineCustomBoolVariable("bdr.do_not_replicate",
                              "Internal. Set during local initialization from basebackup only",
                              NULL,
@@ -1293,40 +966,7 @@ _PG_init(void)
  
     bdr_label_init();
  
-   /* if nothing is configured, we're done */
-   if (connections == NULL)
-   {
-       /* If worker count autoconfigured, use zero */
-       if (bdr_max_workers == -1)
-           bdr_max_workers = 0;
-       goto out;
-   }
-
-   /* Copy 'connections' guc so SplitIdentifierString can modify it in-place */
-   connections_tmp = pstrdup(connections);
-
-   /* Get the list of BDR connection names to iterate over. */
-   if (!SplitIdentifierString(connections_tmp, ',', &connames))
-   {
-       /* syntax error in list */
-       ereport(FATAL,
-               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                errmsg("invalid list syntax for \"bdr.connections\"")));
-   }
-
-   /*
-    * If bdr.max_connections is -1, the default, auto-set it with the
-    * most workers we might need with the current number of connections
-    * configured. Per-db workers are due to use shmem too, so we might
-    * have up to one per-db worker for each configured connection if
-    * each is on a different DB.
-    */
-   if (bdr_max_workers == -1)
-   {
-       bdr_max_workers = list_length(connames) * 3;
-       elog(DEBUG1, "bdr: bdr_max_workers unset, configuring for %d workers",
-               bdr_max_workers);
-   }
+   bdr_supervisor_register();
  
     /*
      * Sanity check max_worker_processes to make sure it's at least big enough
@@ -1342,6 +982,17 @@ _PG_init(void)
                  errhint("Set max_worker_processes to at least %d", bdr_max_workers)));
     }
  
+   /*
+    * If bdr.max_databases is not explicitly specified, assume the worst case
+    * of many DBs with one connection per DB.
+    */
+   if (bdr_max_databases == -1)
+   {
+       bdr_max_databases = bdr_max_workers / 2;
+       elog(DEBUG1, "Autoconfiguring bdr.max_databases to %d (bdr.max_workers/2)",
+            bdr_max_databases);
+   }
+
     /*
      * Allocate a shared memory segment to store the bgworker connection
      * information we must pass to each worker we launch.
@@ -1352,110 +1003,19 @@ _PG_init(void)
      */
     bdr_worker_alloc_shmem_segment();
  
-   /* Allocate space for BDR connection GUCs */
-   bdr_connection_configs = (BdrConnectionConfig**)
-       palloc0(bdr_max_workers * sizeof(BdrConnectionConfig*));
-
-   /* Names of all databases we're going to be doing BDR for */
-   used_databases = palloc0(sizeof(char *) * list_length(connames));
-   /*
-    * For each db named in used_databases, the corresponding index is the name
-    * of the conn with bdr_init_replica=t if any.
-    */
-   database_initcons = palloc0(sizeof(char *) * list_length(connames));
-
-   /*
-    * Read all connections, create/validate parameters for them and do sanity
-    * checks as we go.
-    */
-   connection_config_idx = 0;
-   foreach(c, connames)
-   {
-       char           *name;
-       name = (char *) lfirst(c);
-
-       if (!bdr_create_con_gucs(name, used_databases, &num_used_databases,
-                                database_initcons,
-                                &bdr_connection_configs[connection_config_idx]))
-           continue;
-
-       Assert(bdr_connection_configs[connection_config_idx] != NULL);
-       connection_config_idx++;
-   }
-
-   /*
-    * Free the connames list cells. The strings are just pointers into
-    * 'connections' and must not be freed'd.
-    */
-   list_free(connames);
-   connames = NIL;
-
-   /*
-    * We've ensured there are no duplicate init connections, no need to
-    * remember which conn is the bdr_init_replica conn anymore. The contents
-    * are just pointers into connections_tmp so we don't want to free them.
-    */
-   pfree(database_initcons);
-
-   /*
-    * Copy the list of used databases into a global where we can
-    * use it for registering the per-database workers during shmem init.
-    */
-   bdr_distinct_dbnames = palloc(sizeof(char*)*num_used_databases);
-   memcpy(bdr_distinct_dbnames, used_databases,
-          sizeof(char*)*num_used_databases);
-   bdr_distinct_dbnames_count = num_used_databases;
-   pfree(used_databases);
-   num_used_databases = 0;
-   used_databases = NULL;
-
-   /*
-    * Register the per-db workers and assign them an index in shmem. The
-    * memory doesn't actually exist yet, it'll be allocated in shmem init.
-    *
-    * No protection against multiple launches is requried because this
-    * only runs once, in _PG_init.
-    */
-   bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
-       BGWORKER_BACKEND_DATABASE_CONNECTION;
-   bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-   bgw.bgw_main = NULL;
-   strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
-   strncpy(bgw.bgw_function_name, "bdr_perdb_worker_main", BGW_MAXLEN);
-   bgw.bgw_restart_time = 5;
-   bgw.bgw_notify_pid = 0;
-   for (off = 0; off < bdr_distinct_dbnames_count; off++)
-   {
-       snprintf(bgw.bgw_name, BGW_MAXLEN,
-                "bdr: %s", bdr_distinct_dbnames[off]);
-       /*
-        * This index into BdrWorkerCtl shmem hasn't been populated yet. It'll
-        * be set up in bdr_worker_shmem_create_workers .
-        */
-       bgw.bgw_main_arg = Int32GetDatum(off);
-       RegisterBackgroundWorker(&bgw);
-   }
-
     EmitWarningsOnPlaceholders("bdr");
  
-   pfree(connections_tmp);
-
-out:
-
     /*
      * initialize other modules that need shared memory
-    *
-    * Do so even if we haven't any remote nodes setup, the shared memory might
-    * still be needed for some sql callable functions or such.
      */
  
     /* register a slot for every remote node */
     bdr_count_shmem_init(bdr_max_workers);
     bdr_executor_init();
  #ifdef BUILDING_BDR
-   bdr_sequencer_shmem_init(bdr_max_workers, bdr_distinct_dbnames_count);
+   bdr_sequencer_shmem_init(bdr_max_workers, bdr_max_databases);
  #endif
-   bdr_locks_shmem_init(bdr_distinct_dbnames_count);
+   bdr_locks_shmem_init();
     /* Set up a ProcessUtility_hook to stop unsupported commands being run */
     init_bdr_commandfilter();
  
@@ -1483,9 +1043,12 @@ bdr_lookup_relid(const char *relname, Oid schema_oid)
   * Concurrent executions will block, but not fail.
   *
   * Must be called inside transaction.
+ *
+ * If update_extensions is true, ALTER EXTENSION commands will be issued to
+ * ensure the required extension(s) are at the current version.
   */
  void
-bdr_maintain_schema(void)
+bdr_maintain_schema(bool update_extensions)
  {
     Relation    extrel;
     Oid         btree_gist_oid;
@@ -1504,17 +1067,13 @@ bdr_maintain_schema(void)
     btree_gist_oid = get_extension_oid("btree_gist", true);
     bdr_oid = get_extension_oid("bdr", true);
  
-   /* create required extension if they don't exists yet */
     if (btree_gist_oid == InvalidOid)
-   {
-       CreateExtensionStmt create_stmt;
+       elog(ERROR, "btree_gist is required by BDR but not installed in the current database");
  
-       create_stmt.if_not_exists = false;
-       create_stmt.options = NIL;
-       create_stmt.extname = (char *)"btree_gist";
-       CreateExtension(&create_stmt);
-   }
-   else
+   if (bdr_oid == InvalidOid)
+       elog(ERROR, "bdr extension is not installed in the current database");
+
+   if (update_extensions)
     {
         AlterExtensionStmt alter_stmt;
  
@@ -1522,20 +1081,6 @@ bdr_maintain_schema(void)
         alter_stmt.options = NIL;
         alter_stmt.extname = (char *)"btree_gist";
         ExecAlterExtensionStmt(&alter_stmt);
-   }
-
-   if (bdr_oid == InvalidOid)
-   {
-       CreateExtensionStmt create_stmt;
-
-       create_stmt.if_not_exists = false;
-       create_stmt.options = NIL;
-       create_stmt.extname = (char *)"bdr";
-       CreateExtension(&create_stmt);
-   }
-   else
-   {
-       AlterExtensionStmt alter_stmt;
  
         /* TODO: only do this if necessary */
         alter_stmt.options = NIL;
diff --git a/bdr.control b/bdr.control

index 8673e3976b2f0ea55967216fb26c8769b419e1ed..b3faa319e2cc5dafa75683be3ff66bff7954b758 100644 (file)
--- a/bdr.control
+++ b/bdr.control
@@ -1,6 +1,6 @@
  # bdr extension
  comment = 'Bi-directional replication for PostgreSQL'
-default_version = '0.9.0.0'
+default_version = '0.9.0.1'
  module_pathname = '$libdir/bdr'
  relocatable = false
  requires = btree_gist
diff --git a/bdr.h b/bdr.h

index 566a77e7f48075905173edf9a930f834d642aeb0..12440f2ff855963e728558fb7bf8ba4073f4a899 100644 (file)
--- a/bdr.h
+++ b/bdr.h
@@ -14,6 +14,7 @@
  #include "postmaster/bgworker.h"
  #include "replication/logical.h"
  #include "utils/resowner.h"
+#include "storage/latch.h"
  #include "storage/lock.h"
  
  #include "libpq-fe.h"
@@ -154,11 +155,16 @@ typedef struct BDRTupleData
   */
  typedef struct BdrApplyWorker
  {
+   /* oid of the database this worker is applying changes to */
+   Oid dboid;
+
     /*
-    * Index in bdr_connection_configs of this workers's GUCs
-    * and config info (including dbname, name, etc).
+    * Identification for the remote db we're connecting to; used to
+    * find the appropriate bdr.connections row, etc.
      */
-   int connection_config_idx;
+   uint64      remote_sysid;
+   TimeLineID  remote_timeline;
+   Oid         remote_dboid;
  
     /*
      * If not InvalidXLogRecPtr, stop replay at this point and exit.
@@ -170,15 +176,6 @@ typedef struct BdrApplyWorker
  
     /* Request that the remote forward all changes from other nodes */
     bool forward_changesets;
-
-   /*
-    * Ensure this worker doesn't get registered a second time if there's a
-    * perdb worker restart or postmaster restart. Ideally we'd store the
-    * BackgroundWorkerHandle, but it's an opaque struct.
-    */
-   bool bgw_is_registered;
-
-   size_t perdb_worker_off;
  } BdrApplyWorker;
  
  /*
@@ -187,18 +184,26 @@ typedef struct BdrApplyWorker
   */
  typedef struct BdrPerdbWorker
  {
-   /* local database name */
+   /* local database name to connect to */
     NameData dbname;
  
     /* number of outgoing connections from this database */
-   size_t nnodes;
+   Size nnodes;
  
     size_t seq_slot;
  
+   /* The perdb worker's latch from the PROC array, for use from other backends */
+   Latch      *proclatch;
+
+   /* Oid of the database the worker is attached to - populated after start */
+   Oid database_oid;
  } BdrPerdbWorker;
  
  /*
   * Type of BDR worker in a BdrWorker struct
+ *
+ * Note that the supervisor worker doesn't appear here, it has its own
+ * dedicated entry in the shmem segment.
   */
  typedef enum {
     /*
@@ -206,7 +211,7 @@ typedef enum {
      * it's set by memset(...) during shm segment init.
      */
     BDR_WORKER_EMPTY_SLOT = 0,
-   /* This shm array slot contains data for a */
+   /* This shm array slot contains data for a BdrApplyWorker */
     BDR_WORKER_APPLY,
     /* This is data for a per-database worker BdrPerdbWorker */
     BDR_WORKER_PERDB,
@@ -235,18 +240,11 @@ typedef struct BdrWorker
  
  } BdrWorker;
  
-/*
- * Params for every connection in bdr.connections.
- *
- * Contains n=bdr_max_workers elements, may have NULL entries.
- */
-extern BdrConnectionConfig **bdr_connection_configs;
-
  /* GUCs */
  extern int bdr_default_apply_delay;
  extern int bdr_max_workers;
+extern int bdr_max_databases;
  extern char *bdr_temp_dump_directory;
-extern bool bdr_init_from_basedump;
  extern bool bdr_log_conflicts_to_table;
  extern bool bdr_conflict_logging_include_tuples;
  extern bool bdr_permit_unsafe_commands;
@@ -263,13 +261,20 @@ typedef struct BdrWorkerControl
  {
     /* Must hold this lock when writing to BdrWorkerControl members */
     LWLockId     lock;
+   /* Worker generation number, incremented on postmaster restart */
+   uint16       worker_generation;
     /* Set/unset by bdr_apply_pause()/_replay(). */
     bool         pause_apply;
+   /* Is this a restart of the supervisor, i.e. not its first startup? */
+   bool         is_supervisor_restart;
+   /* Latch for the supervisor worker */
+   Latch       *supervisor_latch;
     /* Array members, of size bdr_max_workers */
     BdrWorker    slots[FLEXIBLE_ARRAY_MEMBER];
  } BdrWorkerControl;
  
  extern BdrWorkerControl *BdrWorkerCtl;
+extern BdrWorker       *bdr_worker_slot;
  
  extern ResourceOwner bdr_saved_resowner;
  
@@ -294,8 +299,25 @@ extern Oid BdrLocksByOwnerRelid;
  
  extern Oid  BdrReplicationSetConfigRelid;
  
+/* Structure representing bdr_nodes record */
+typedef struct BDRNodeInfo
+{
+   /* ID */
+   uint64      sysid;
+   TimeLineID  timeline;
+   Oid         dboid;
+
+   char        status;
+
+   char       *local_dsn;
+   char       *init_from_dsn;
+} BDRNodeInfo;
+
  extern Oid bdr_lookup_relid(const char *relname, Oid schema_oid);
  
+extern void bdr_sequencer_set_nnodes(Size nnodes);
+
+
  /* apply support */
  extern void bdr_fetch_sysid_via_node_id(RepNodeId node_id, uint64 *sysid,
                                         TimeLineID *tli, Oid *remote_dboid);
@@ -385,8 +407,11 @@ PGDLLEXPORT extern Datum bdr_sequence_setval(PG_FUNCTION_ARGS);
  PGDLLEXPORT extern Datum bdr_sequence_options(PG_FUNCTION_ARGS);
  #endif
  
+extern int bdr_sequencer_get_next_free_slot(void); //XXX PERDB temp
+
+
  /* statistic functions */
-extern void bdr_count_shmem_init(size_t nnodes);
+extern void bdr_count_shmem_init(Size nnodes);
  extern void bdr_count_set_current_node(RepNodeId node_id);
  extern void bdr_count_commit(void);
  extern void bdr_count_rollback(void);
@@ -405,10 +430,10 @@ extern bool bdr_get_integer_timestamps(void);
  extern bool bdr_get_bigendian(void);
  
  /* initialize a new bdr member */
-extern void bdr_init_replica(Name dbname);
+extern void bdr_init_replica(BDRNodeInfo *local_node);
  
  /* shared memory management */
-extern void bdr_maintain_schema(void);
+extern void bdr_maintain_schema(bool update_extensions);
  extern BdrWorker* bdr_worker_shmem_alloc(BdrWorkerType worker_type,
                                          uint32 *ctl_idx);
  extern void bdr_worker_shmem_release(BdrWorker* worker, BackgroundWorkerHandle *handle);
@@ -423,20 +448,35 @@ extern void bdr_executor_always_allow_writes(bool always_allow);
  extern void bdr_queue_ddl_command(char *command_tag, char *command);
  extern void bdr_execute_ddl_command(char *cmdstr, char *perpetrator, bool tx_just_started);
  
-extern void bdr_locks_shmem_init(Size num_used_databases);
+extern void bdr_locks_shmem_init(void);
  extern void bdr_locks_check_query(void);
  
-/* background workers */
-extern void bdr_worker_init(char* dbname);
+/* background workers and supporting functions for them */
  PGDLLEXPORT extern void bdr_apply_main(Datum main_arg);
  PGDLLEXPORT extern void bdr_perdb_worker_main(Datum main_arg);
+PGDLLEXPORT extern void bdr_supervisor_worker_main(Datum main_arg);
+
+extern void bdr_worker_init(char* dbname);
+extern void bdr_supervisor_register(void);
+
+extern void bdr_sighup(SIGNAL_ARGS);
+extern void bdr_sigterm(SIGNAL_ARGS);
+
+extern int find_perdb_worker_slot(Oid dboid,
+                                    BdrWorker **worker_found);
+
+extern void bdr_launch_apply_workers(Oid dboid);
  
  /* Information functions */
  extern int bdr_parse_version(const char * bdr_version_str, int *o_major,
                              int *o_minor, int *o_rev, int *o_subrev);
  
  /* manipulation of bdr catalogs */
-extern char bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli, Oid dboid);
+extern char bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli,
+                                      Oid dboid);
+extern BDRNodeInfo * bdr_nodes_get_local_info(uint64 sysid, TimeLineID tli,
+                                         Oid dboid);
+extern void bdr_bdr_node_free(BDRNodeInfo *node);
  extern void bdr_nodes_set_local_status(char status);
  
  extern Oid GetSysCacheOidError(int cacheId, Datum key1, Datum key2, Datum key3,
@@ -463,7 +503,8 @@ bdr_copytable(PGconn *copyfrom_conn, PGconn *copyto_conn,
  
  /* helpers shared by multiple worker types */
  extern struct pg_conn* bdr_connect(const char *conninfo, Name appname,
-                                  uint64* remote_sysid_i, TimeLineID *remote_tlid_i,
+                                  uint64* remote_sysid_i,
+                                  TimeLineID *remote_tlid_i,
                                    Oid *out_dboid_i);
  
  extern struct pg_conn *
@@ -474,11 +515,7 @@ bdr_establish_connection_and_slot(const char *dsn,
                                   TimeLineID *out_timeline,
                                   Oid *out_dboid,
                                   RepNodeId *out_replication_identifier,
-                                 char **out_snapshot);
-extern void
-bdr_build_ident_and_slotname(uint64 remote_sysid, TimeLineID remote_tlid,
-       Oid remote_dboid, char **out_replication_identifier,
-       Name out_slot_name);
+                                 char **out_snapshot);
  
  extern PGconn* bdr_connect_nonrepl(const char *connstring,
         const char *appnamesuffix);
diff --git a/bdr_apply.c b/bdr_apply.c

index f901f488089253961d3466e4b7ba98ff71ffd7c4..a6fe91b214ab98619d219a948750f3d8ab86e29d 100644 (file)
--- a/bdr_apply.c
+++ b/bdr_apply.c
@@ -35,6 +35,8 @@
  #include "catalog/namespace.h"
  #include "catalog/pg_type.h"
  
+#include "executor/spi.h"
+
  #include "libpq/pqformat.h"
  
  #include "mb/pg_wchar.h"
@@ -97,16 +99,10 @@ static RepNodeId        remote_origin_id = InvalidRepNodeId;
  /*
   * This code only runs within an apply bgworker, so we can stash a pointer to our
   * state in shm in a global for convenient access.
- *
- * TODO: make static once bdr_apply_main moved into bdr.c
   */
-BdrApplyWorker *bdr_apply_worker = NULL;
+static BdrApplyWorker *bdr_apply_worker = NULL;
  
-/*
- * GUCs for this apply worker - again, this is fixed for the lifetime of the
- * worker so we can stash it in a global.
- */
-BdrConnectionConfig *bdr_apply_config = NULL;
+static BdrConnectionConfig *bdr_apply_config = NULL;
  
  dlist_head bdr_lsn_association = DLIST_STATIC_INIT(bdr_lsn_association);
  
@@ -195,8 +191,7 @@ process_remote_begin(StringInfo s)
     replication_origin_xid = remote_xid;
  
     snprintf(statbuf, sizeof(statbuf),
-           "bdr_apply: BEGIN origin(source, orig_lsn, timestamp): %s, %X/%X, %s",
-            bdr_apply_config->name,
+           "bdr_apply: BEGIN origin(source, orig_lsn, timestamp): %X/%X, %s",
             (uint32) (origlsn >> 32), (uint32) origlsn,
             timestamptz_to_str(committime));
  
@@ -381,8 +376,7 @@ process_remote_commit(StringInfo s)
             && bdr_apply_worker->replay_stop_lsn <= end_lsn)
     {
         ereport(LOG,
-               (errmsg("bdr apply %s finished processing; replayed to %X/%X of required %X/%X",
-                bdr_apply_config->name,
+               (errmsg("bdr apply finished processing; replayed to %X/%X of required %X/%X",
                  (uint32)(end_lsn>>32), (uint32)end_lsn,
                  (uint32)(bdr_apply_worker->replay_stop_lsn>>32), (uint32)bdr_apply_worker->replay_stop_lsn)));
         /*
@@ -2389,6 +2383,7 @@ bdr_apply_work(PGconn* streamConn)
     }
  }
  
+
  /*
   * Entry point for a BDR apply worker.
   *
@@ -2405,27 +2400,70 @@ bdr_apply_main(Datum main_arg)
     RepNodeId   replication_identifier;
     XLogRecPtr  start_from;
     NameData    slot_name;
-   BdrWorker  *bdr_worker_slot;
+   NameData    dbname;
+   BdrWorker  *perdb;
+   uint32      worker_arg;
+   uint16      apply_worker_idx,
+               worker_generation;
+   int         perdb_worker_idx;
  
     Assert(IsBackgroundWorker);
  
+   worker_arg = DatumGetInt32(main_arg);
+
+   worker_generation = (uint16)(worker_arg >> 16);
+   apply_worker_idx = (uint16)(worker_arg & 0x0000FFFF);
+
+   if (worker_generation != BdrWorkerCtl->worker_generation)
+   {
+       elog(DEBUG1, "apply worker from generation %d exiting after finding shmem generation is %d",
+            worker_generation, BdrWorkerCtl->worker_generation);
+       proc_exit(0);
+   }
+
     initStringInfo(&query);
  
-   bdr_worker_slot = &BdrWorkerCtl->slots[ DatumGetInt32(main_arg) ];
+   bdr_worker_slot = &BdrWorkerCtl->slots[ apply_worker_idx ];
     Assert(bdr_worker_slot->worker_type == BDR_WORKER_APPLY);
     bdr_apply_worker = &bdr_worker_slot->data.apply;
     bdr_worker_type = BDR_WORKER_APPLY;
  
-   bdr_apply_config = bdr_connection_configs[bdr_apply_worker->connection_config_idx];
-   Assert(bdr_apply_config != NULL);
-
-   bdr_worker_init(bdr_apply_config->dbname);
+   /*
+    * Get the database name to connect to from the perdb worker for this db
+    *
+    * It'd be preferable to just connect by oid, but the bgworkers interface
+    * doesn't permit us to do that, and we can't look up the syscache to find
+    * the name by oid until we're connected.
+    */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
+   perdb_worker_idx = find_perdb_worker_slot(bdr_apply_worker->dboid, NULL);
+   Assert(perdb_worker_idx >= 0);
+   perdb = &BdrWorkerCtl->slots[perdb_worker_idx];
+   Assert(perdb->worker_type == BDR_WORKER_PERDB);
+   namecpy(&dbname, &perdb->data.perdb.dbname);
+   LWLockRelease(BdrWorkerCtl->lock);
+
+   /* Then unblock signals, connect to the db, etc */
+   bdr_worker_init(NameStr(dbname));
+
+   Assert(MyDatabaseId == bdr_apply_worker->dboid);
+
+   /* Read our connection configuration from the database */
+   bdr_apply_config = bdr_get_connection_config(
+       bdr_apply_worker->remote_sysid,
+       bdr_apply_worker->remote_timeline,
+       bdr_apply_worker->remote_dboid,
+       false);
+
+   Assert(bdr_apply_config->sysid == bdr_apply_worker->remote_sysid &&
+          bdr_apply_config->timeline == bdr_apply_worker->remote_timeline &&
+          bdr_apply_config->dboid == bdr_apply_worker->remote_dboid);
  
     CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr apply top-level resource owner");
     bdr_saved_resowner = CurrentResourceOwner;
  
     elog(DEBUG1, "%s initialized on %s",
-        MyBgworkerEntry->bgw_name, bdr_apply_config->dbname);
+        MyBgworkerEntry->bgw_name, NameStr(dbname));
  
     /* Set our local application_name for our SPI connections */
     resetStringInfo(&query);
@@ -2504,6 +2542,8 @@ bdr_apply_main(Datum main_arg)
     appendStringInfo(&query, ", db_encoding '%s'", GetDatabaseEncodingName());
     if (bdr_apply_worker->forward_changesets)
         appendStringInfo(&query, ", forward_changesets 't'");
+   if (bdr_apply_config->is_unidirectional)
+       appendStringInfo(&query, ", unidirectional 't'");
  
     appendStringInfoChar(&query, ')');
  
diff --git a/bdr_catalogs.c b/bdr_catalogs.c

index 5091747fa6d2ad586e457d848542d8d756ba2185..fb4b85078ae4d31dc2bb0db0d603e31aea158549 100644 (file)
--- a/bdr_catalogs.c
+++ b/bdr_catalogs.c
@@ -31,8 +31,13 @@
  
  #include "utils/builtins.h"
  #include "utils/guc.h"
+#include "utils/memutils.h"
  #include "utils/syscache.h"
  
+static int getattno(const char *colname);
+static char* bdr_textarr_to_identliststr(ArrayType *textarray);
+
+
  /* GetSysCacheOid equivalent that errors out if nothing is found */
  Oid
  GetSysCacheOidError(int cacheId,
@@ -70,7 +75,7 @@ bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli, Oid dboid)
     Oid         argtypes[] = { TEXTOID, OIDOID, OIDOID };
     Datum       values[3];
     bool        isnull;
-   char        status;
+   char        status;
     char        sysid_str[33];
     Oid         schema_oid;
  
@@ -118,17 +123,108 @@ bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli, Oid dboid)
  }
  
  /*
- * Insert a row for the local node's (sysid,tlid,dboid) with the passed status
- * into bdr.bdr_nodes. No existing row for this key may exist.
+ * Get the bdr.bdr_nodes record for the specified node from the local
+ * bdr.bdr_nodes table via SPI.
   *
- * Unlike bdr_set_remote_status, '\0' may not be passed to delete the row, and
- * no upsert is performed. This is a simple insert only.
+ * Returns a BDRNodeInfo palloc'd in the caller's context, or NULL if no such row exists.
   *
- * Unlike bdr_nodes_get_local_status, only the status of the local node may
- * be set.
+ * SPI must be initialized, and you must be in a running transaction.
+ */
+BDRNodeInfo *
+bdr_nodes_get_local_info(uint64 sysid, TimeLineID tli, Oid dboid)
+{
+   int         spi_ret;
+   Oid         argtypes[] = { TEXTOID, OIDOID, OIDOID };
+   Datum       values[3];
+   bool        isnull;
+   BDRNodeInfo *node;
+   char        sysid_str[33];
+   Oid         schema_oid;
+   MemoryContext caller_ctx;
+   MemoryContext saved_ctx PG_USED_FOR_ASSERTS_ONLY;
+
+   Assert(IsTransactionState());
+
+   /* Save the calling memory context, which we'll allocate results in */
+   caller_ctx = MemoryContextSwitchTo(CurTransactionContext);
+
+   Assert(MemoryContextIsValid(caller_ctx));
+
+   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, sysid);
+   sysid_str[sizeof(sysid_str)-1] = '\0';
+
+   /*
+    * Determine if BDR is present on this DB. The output plugin can
+    * be started on a db that doesn't actually have BDR active, but
+    * we don't want to allow that.
+    *
+    * Check for a bdr schema.
+    */
+   schema_oid = GetSysCacheOid1(NAMESPACENAME, CStringGetDatum("bdr"));
+   if (schema_oid == InvalidOid)
+       ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+               errmsg("No bdr schema is present in database %s, cannot create a bdr slot",
+                      get_database_name(MyDatabaseId)),
+               errhint("There is no bdr.connections entry for this database on the target node or bdr is not in shared_preload_libraries")));
+
+   values[0] = CStringGetTextDatum(sysid_str);
+   values[1] = ObjectIdGetDatum(tli);
+   values[2] = ObjectIdGetDatum(dboid);
+
+   spi_ret = SPI_execute_with_args(
+           "SELECT node_status, node_local_dsn, node_init_from_dsn"
+           "  FROM bdr.bdr_nodes"
+           " WHERE node_sysid = $1 AND node_timeline = $2 AND node_dboid = $3",
+           3, argtypes, values, NULL, false, 1);
+
+   if (spi_ret != SPI_OK_SELECT)
+       elog(ERROR, "Unable to query bdr.bdr_nodes, SPI error %d", spi_ret);
+
+   if (SPI_processed == 0)
+       return NULL;
+
+   /* Switch to calling memory context to copy results */
+   saved_ctx = MemoryContextSwitchTo(caller_ctx);
+   Assert(MemoryContextIsValid(saved_ctx));
+
+   node = palloc(sizeof(BDRNodeInfo));
+   node->sysid = sysid;
+   node->timeline = tli;
+   node->dboid = dboid;
+   node->status = DatumGetChar(SPI_getbinval(SPI_tuptable->vals[0],
+                                             SPI_tuptable->tupdesc, 1,
+                                             &isnull));
+   node->local_dsn = SPI_getvalue(SPI_tuptable->vals[0],
+                                  SPI_tuptable->tupdesc, 2);
+   node->init_from_dsn = SPI_getvalue(SPI_tuptable->vals[0],
+                                      SPI_tuptable->tupdesc, 3);
+
+   if (isnull)
+       elog(ERROR, "bdr.bdr_nodes.status NULL; shouldn't happen");
+
+   return node;
+}
+
+/* Free the BDRNodeInfo pointer including its properties. */
+void
+bdr_bdr_node_free(BDRNodeInfo *node)
+{
+   if (node == NULL)
+       return;
+
+   if (node->local_dsn)
+       pfree(node->local_dsn);
+   if (node->init_from_dsn)
+       pfree(node->init_from_dsn);
+   pfree(node);
+}
+
+/*
+ * Update the status field on the local node (as identified by current
+ * sysid,tlid,dboid) of bdr.bdr_nodes. The node record must already exist.
   *
- * SPI must be initialized, and you must be in a running transaction that is
- * not bound to any remote node replication state.
+ * Unlike bdr_nodes_get_local_status, this interface does not accept
+ * sysid, tlid and dboid input but can only set the status of the local node.
   */
  void
  bdr_nodes_set_local_status(char status)
@@ -137,12 +233,21 @@ bdr_nodes_set_local_status(char status)
     Oid         argtypes[] = { CHAROID, TEXTOID, OIDOID, OIDOID };
     Datum       values[4];
     char        sysid_str[33];
+   bool        tx_started = false;
+   bool        spi_pushed;
  
-   Assert(status != '\0'); /* Cannot pass \0 to delete */
-   Assert(IsTransactionState());
+   Assert(status != '\0'); /* Cannot pass \0 */
     /* Cannot have replication apply state set in this tx */
     Assert(replication_origin_id == InvalidRepNodeId);
  
+   if (!IsTransactionState())
+   {
+       tx_started = true;
+       StartTransactionCommand();
+   }
+   spi_pushed = SPI_push_conditional();
+   SPI_connect();
+
     snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
              GetSystemIdentifier());
     sysid_str[sizeof(sysid_str)-1] = '\0';
@@ -153,17 +258,24 @@ bdr_nodes_set_local_status(char status)
     values[3] = ObjectIdGetDatum(MyDatabaseId);
  
     spi_ret = SPI_execute_with_args(
-                              "INSERT INTO bdr.bdr_nodes"
-                              " (node_status, node_sysid, node_timeline, node_dboid)"
-                              " VALUES ($1, $2, $3, $4);",
+                              "UPDATE bdr.bdr_nodes"
+                              "   SET node_status = $1"
+                              " WHERE node_sysid = $2"
+                              "   AND node_timeline = $3"
+                              "   AND node_dboid = $4;",
                                4, argtypes, values, NULL, false, 0);
  
-   if (spi_ret != SPI_OK_INSERT)
-       elog(ERROR, "Unable to insert row (status=%c, node_sysid="
+   if (spi_ret != SPI_OK_UPDATE)
+       elog(ERROR, "Unable to set status=%c of row (node_sysid="
                     UINT64_FORMAT ", node_timeline=%u, node_dboid=%u) "
-                   "into bdr.bdr_nodes: SPI error %d",
+                   "in bdr.bdr_nodes: SPI error %d",
                     status, GetSystemIdentifier(), ThisTimeLineID,
                     MyDatabaseId, spi_ret);
+
+   SPI_finish();
+   SPI_pop_conditional(spi_pushed);
+   if (tx_started)
+       CommitTransactionCommand();
  }
  
  /*
@@ -218,6 +330,301 @@ bdr_fetch_node_id_via_sysid(uint64 sysid, TimeLineID tli, Oid dboid)
     return GetReplicationIdentifier(ident, false);
  }
  
+/*
+ * Read connection configuration data from the DB and return zero or more
+ * matching palloc'd BdrConnectionConfig results in a list.
+ *
+ * A transaction must be open.
+ *
+ * The list and values are allocated in the calling memory context. By default
+ * this is the transaction memory context, but you can switch to a different
+ * context before calling.
+ *
+ * Each BdrConnectionConfig's char* fields are palloc'd values.
+ *
+ * Uses the SPI, so push/pop caller's SPI state if needed.
+ *
+ * May raise exceptions from queries, SPI errors, etc.
+ *
+ * If both an entry with conn_origin_* matching this node and one with zeroed
+ * conn_origin_* are found, only the one specific to this node is meant to be
+ * returned, as it takes precedence over any generic configuration entry.
+ */
+List*
+bdr_read_connection_configs()
+{
+   HeapTuple tuple;
+   StringInfoData query;
+   int         i;
+   int         ret;
+   List       *configs = NIL;
+   MemoryContext caller_ctx, saved_ctx;
+   char        sysid_str[33];
+   Datum       values[3];
+   Oid         types[3] = { TEXTOID, OIDOID, OIDOID };
+
+   Assert(IsTransactionState());
+
+   /* Save the calling memory context, which we'll allocate results in */
+   caller_ctx = MemoryContextSwitchTo(CurTransactionContext);
+
+   initStringInfo(&query);
+
+   /*
+    * Find a connections row specific to this origin node or if none
+    * exists, the default connection data for that node.
+    *
+    * Configurations for all nodes, including the local node, are read.
+    */
+   appendStringInfo(&query, "SELECT DISTINCT ON (conn_sysid, conn_timeline, conn_dboid) "
+                            "  conn_sysid, conn_timeline, conn_dboid, "
+                            "  conn_dsn, conn_apply_delay, "
+                            "  conn_replication_sets, "
+                            "  conn_is_unidirectional, "
+                            "  conn_origin_dboid <> 0 AS origin_is_my_id "
+                            "FROM bdr.bdr_connections "
+                            "WHERE (conn_origin_sysid = '0' "
+                            "  AND  conn_origin_timeline = 0 "
+                            "  AND  conn_origin_dboid = 0) "
+                            "   OR (conn_origin_sysid = $1 "
+                            "  AND  conn_origin_timeline = $2 "
+                            "  AND  conn_origin_dboid = $3) "
+                            "ORDER BY conn_sysid, conn_timeline, conn_dboid, "
+                            "         conn_origin_sysid ASC NULLS LAST, "
+                            "         conn_timeline ASC NULLS LAST, "
+                            "         conn_dboid ASC NULLS LAST "
+                    );
+
+   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, GetSystemIdentifier());
+   sysid_str[sizeof(sysid_str)-1] = '\0';
+
+   values[0] = CStringGetTextDatum(&sysid_str[0]);
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+
+   SPI_connect();
+
+   ret = SPI_execute_with_args(query.data, 3, types, values, NULL, false, 0);
+
+   if (ret != SPI_OK_SELECT)
+       elog(ERROR, "SPI error while querying bdr.bdr_connections");
+
+   /* Switch to calling memory context to copy results */
+   saved_ctx = MemoryContextSwitchTo(caller_ctx);
+
+   for (i = 0; i < SPI_processed; i++)
+   {
+       Datum           tmp_datum;
+       bool            isnull;
+       ArrayType      *conn_replication_sets;
+       char           *tmp_sysid;
+
+       BdrConnectionConfig *cfg = palloc(sizeof(BdrConnectionConfig));
+
+       tuple = SPI_tuptable->vals[i];
+
+       /*
+        * Fetch tuple attributes
+        *
+        * Note: SPI_getvalue calls the output function for the type, so the
+        * string is allocated in our memory context and doesn't need copying.
+        */
+       tmp_sysid = SPI_getvalue(tuple, SPI_tuptable->tupdesc,
+                                getattno("conn_sysid"));
+
+       if (sscanf(tmp_sysid, UINT64_FORMAT, &cfg->sysid) != 1)
+           elog(ERROR, "Parsing sysid uint64 from %s failed", tmp_sysid);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_timeline"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->timeline = DatumGetObjectId(tmp_datum);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_dboid"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->dboid = DatumGetObjectId(tmp_datum);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_is_unidirectional"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->is_unidirectional = DatumGetBool(tmp_datum);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("origin_is_my_id"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->origin_is_my_id = DatumGetBool(tmp_datum);
+
+
+       cfg->dsn = SPI_getvalue(tuple,
+                                            SPI_tuptable->tupdesc,
+                                            getattno("conn_dsn"));
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_apply_delay"), &isnull);
+       if (isnull)
+           cfg->apply_delay = -1;
+       else
+           cfg->apply_delay = DatumGetInt32(tmp_datum);
+
+       /*
+        * Replication sets are stored in the catalogs as a text[]
+        * of identifiers, so we'll want to unpack that.
+        */
+
+       conn_replication_sets = (ArrayType*)
+           SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                         getattno("conn_replication_sets"), &isnull);
+
+       if (isnull)
+           cfg->replication_sets = NULL;
+       else
+       {
+           cfg->replication_sets =
+               bdr_textarr_to_identliststr(DatumGetArrayTypeP(conn_replication_sets));
+       }
+
+       configs = lcons(cfg, configs);
+
+   }
+
+   MemoryContextSwitchTo(saved_ctx);
+
+   SPI_finish();
+
+   MemoryContextSwitchTo(caller_ctx);
+
+   return configs;
+}
+
+void
+bdr_free_connection_config(BdrConnectionConfig *cfg)
+{
+   if (cfg->dsn != NULL)
+       pfree(cfg->dsn);
+   if (cfg->replication_sets != NULL)
+       pfree(cfg->replication_sets);
+}
+
+/*
+ * Fetch the connection configuration for the node identified by the passed
+ * (sysid, timeline, dboid); NULL if none exists and missing_ok is true.
+ */
+BdrConnectionConfig*
+bdr_get_connection_config(uint64 sysid, TimeLineID timeline, Oid dboid,
+                         bool missing_ok)
+{
+   List *configs;
+   ListCell *lc;
+   MemoryContext saved_ctx;
+   BdrConnectionConfig *found_config = NULL;
+   bool tx_started = false;
+
+   Assert(MyDatabaseId != InvalidOid);
+
+   if (!IsTransactionState())
+   {
+       tx_started = true;
+       StartTransactionCommand();
+   }
+
+   saved_ctx = MemoryContextSwitchTo(TopMemoryContext);
+   configs = bdr_read_connection_configs();
+   MemoryContextSwitchTo(saved_ctx);
+
+   /*
+    * TODO DYNCONF Instead of reading all configs and then discarding all but
+    * the interesting one, we should really be doing a different query that
+    * returns only the configuration of interest. As this runs only during apply
+    * worker startup the impact is negligible.
+    */
+   foreach(lc, configs)
+   {
+       BdrConnectionConfig *cfg = (BdrConnectionConfig*) lfirst(lc);
+
+       if (cfg->sysid == sysid
+           && cfg->timeline == timeline
+           && cfg->dboid == dboid)
+       {
+           found_config = cfg;
+           break;
+       }
+       else
+       {
+           bdr_free_connection_config(cfg);
+       }
+   }
+
+   if (found_config == NULL && !missing_ok)
+       elog(ERROR, "Failed to find expected bdr.connections row "
+                   "(conn_sysid,conn_timeline,conn_dboid) = "
+                   "("UINT64_FORMAT",%u,%u) "
+                   "in bdr.bdr_connections",
+                   sysid, timeline, dboid);
+
+   if (tx_started)
+       CommitTransactionCommand();
+
+   list_free(configs);
+
+   return found_config;
+}
+
+
+static int
+getattno(const char *colname)
+{
+   int attno;
+
+   attno = SPI_fnumber(SPI_tuptable->tupdesc, colname);
+   if (attno == SPI_ERROR_NOATTRIBUTE)
+       elog(ERROR, "SPI error while reading %s from bdr.bdr_connections", colname);
+
+   return attno;
+}
+
+/*
+ * Given a text[] Datum guaranteed to contain no nulls, return an
+ * identifier-quoted comma-separated string allocated in the current memory
+ * context.
+ */
+static char*
+bdr_textarr_to_identliststr(ArrayType *textarray)
+{
+   Datum          *elems;
+   int             nelems, i;
+   StringInfoData  si;
+
+   deconstruct_array(textarray,
+                     TEXTOID, -1, false, 'i',
+                     &elems, NULL, &nelems);
+
+   if (nelems == 0)
+       return pstrdup("");
+
+   initStringInfo(&si);
+
+   appendStringInfoString(&si,
+       quote_identifier(TextDatumGetCString(elems[0])));
+   for (i = 1; i < nelems; i++)
+   {
+       appendStringInfoString(&si, ",");
+       appendStringInfoString(&si,
+           quote_identifier(TextDatumGetCString(elems[i])));
+   }
+
+   /*
+    * The stringinfo is on the stack, but its data element is palloc'd
+    * in the caller's context and can be returned safely.
+    */
+   return si.data;
+
+}
+
  /*
   * Helper to format node identity info into buffers, which must already be
   * allocated and big enough to hold a unit64 + terminator (33 bytes).
diff --git a/bdr_common.c b/bdr_common.c

new file mode 100644 (file)

index 0000000..c4ad1cf
--- /dev/null
+++ b/bdr_common.c
@@ -0,0 +1,39 @@
+/*
+ * bdr_common.c
+ *
+ * BiDirectionalReplication
+ *
+ * Utility functions that can be shared between extension and cli
+ * (don't require server side libraries).
+ *
+ * Copyright (c) 2015, PostgreSQL Global Development Group
+ *
+ * bdr_common.c
+ */
+
+
+#include "postgres.h"
+
+#include "access/xlogdefs.h"
+#include "nodes/pg_list.h"
+
+#include "bdr_internal.h"
+
+
+/*
+ * Build a replication slot name from node identifiers.
+ *
+ * The name is rendered with BDR_SLOT_NAME_FORMAT from local_dboid, the
+ * system identifier (passed to the format as text), tlid, dboid and
+ * EMPTY_REPLICATION_NAME, and stored in *slot_name. At most
+ * NAMEDATALEN - 1 characters are kept.
+ */
+void
+bdr_slot_name(Name slot_name, uint64 sysid, TimeLineID tlid,
+             Oid dboid, Oid local_dboid)
+{
+   char       *dest = NameStr(*slot_name);
+   char        sysid_buf[33];
+
+   /* Render the system identifier first, the slot format takes it as text. */
+   snprintf(sysid_buf, sizeof(sysid_buf), UINT64_FORMAT, sysid);
+   sysid_buf[sizeof(sysid_buf)-1] = '\0';
+
+   snprintf(dest, NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
+            local_dboid, sysid_buf, tlid, dboid,
+            EMPTY_REPLICATION_NAME);
+   dest[NAMEDATALEN-1] = '\0';
+}
diff --git a/bdr_conflict_logging.c b/bdr_conflict_logging.c

index d979bbe93f208c57e07c5f6b11c5df6e1fff514a..0447a11d9b51b5eda630eedee718823b2ac1ab53 100644 (file)
--- a/bdr_conflict_logging.c
+++ b/bdr_conflict_logging.c
@@ -45,12 +45,6 @@ static Oid BdrConflictTypeOid = InvalidOid;
  static Oid BdrConflictResolutionOid = InvalidOid;
  static Oid BdrConflictHistorySeqId = InvalidOid;
  
-/*
- * All this code runs only in the context of an apply worker, so
- * we can access the apply worker state global safely
- */
-extern BdrApplyWorker *bdr_apply_worker;
-
  #define BDR_CONFLICT_HISTORY_COLS 30
  #define SYSID_DIGITS 33
  
diff --git a/bdr_executor.c b/bdr_executor.c

index dfef1a5f772cd2a6761300738238dde454336269..e700cf27ebd8927b74798f944ec3a580311edf70 100644 (file)
--- a/bdr_executor.c
+++ b/bdr_executor.c
@@ -773,7 +773,7 @@ BdrExecutorStart(QueryDesc *queryDesc, int eflags)
     bool        performs_writes = false;
     ListCell   *l;
  
-   if (bdr_always_allow_writes || !bdr_is_bdr_activated_db(MyDatabaseId))
+   if (bdr_always_allow_writes)
         goto done;
  
     /* identify whether this is a modifying statement */
@@ -786,6 +786,9 @@ BdrExecutorStart(QueryDesc *queryDesc, int eflags)
     if (!performs_writes)
         goto done;
  
+   if (!bdr_is_bdr_activated_db(MyDatabaseId))
+       goto done;
+
  #ifdef BUILDING_BDR
     bdr_locks_check_query();
  #endif
diff --git a/bdr_init_copy.c b/bdr_init_copy.c

index 134d98010fe53b47ca4de3efa1099187f07fddfb..d0e98ef79bc11c14fa1031eb944f3e076bc33941 100644 (file)
--- a/bdr_init_copy.c
+++ b/bdr_init_copy.c
@@ -13,6 +13,8 @@
  
  #include "postgres_fe.h"
  
+#include "getopt_long.h"
+
  #include "port.h"
  
  #include "libpq-fe.h"
@@ -45,74 +47,102 @@
  typedef struct RemoteInfo {
     uint64      sysid;
     TimeLineID  tlid;
-   Oid         dboid;
+   int         numdbs;     /* number of entries in dboids[] and dbnames[] */
+   Oid        *dboids;     /* per-database oid, indexed in step with dbnames[] */
+   char      **dbnames;    /* per-database name, used by main() to connect */
  } RemoteInfo;
  
-static char            *argv0 = NULL;
-static const char  *progname;
-static uint64       system_identifier;
-static NameData         restore_point_name;
-static char            *data_dir = NULL;
-static char            *config_options = "";
-static char             pid_file[MAXPGPATH];
-static time_t       start_time;
+/*
+ * Identifiers of the remote node and of the local node being created.
+ * main() fills this in and passes it to the per-database helpers.
+ */
+typedef struct NodeInfo {
+   uint64      remote_sysid;   /* copied from get_remote_info()'s sysid */
+   TimeLineID  remote_tlid;    /* copied from get_remote_info()'s tlid */
+   uint64      local_sysid;    /* result of GenerateSystemIdentifier() */
+   TimeLineID  local_tlid;     /* main() sets this to remote tlid + 1 */
+} NodeInfo;
+
+/*
+ * Output verbosity. The global starts at VERBOSITY_NORMAL and main() bumps
+ * it by one for each -v option; print_msg() emits a message only when the
+ * current verbosity is at least the level the message was tagged with.
+ */
+typedef enum {
+   VERBOSITY_NORMAL,
+   VERBOSITY_VERBOSE,
+   VERBOSITY_DEBUG
+} VerbosityLevelEnum;
+
+static char           *argv0 = NULL;
+static const char  *progname;
+static char           *data_dir = NULL;
+static char            pid_file[MAXPGPATH];
+static time_t      start_time;
+static VerbosityLevelEnum  verbosity = VERBOSITY_NORMAL;
  
  /* defined as static so that die() can close them */
  static PGconn      *local_conn = NULL;
  static PGconn      *remote_conn = NULL;
  
-BdrConnectionConfig    **bdr_connection_configs;
-size_t              bdr_connection_config_count;
-
  static void signal_handler(int sig);
  static void usage(void);
  static void die(const char *fmt,...)
  __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
-static void print_msg(const char *fmt,...)
-__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
+static void print_msg(VerbosityLevelEnum level, const char *fmt,...)
+__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));
  
-static int run_pg_ctl(const char *arg, const char *opts);
-static char *get_postgres_guc_value(char *guc, char *defval);
-static bool wait_postmaster_connection(void);
-static void wait_postgres_shutdown(void);
+static int run_pg_ctl(const char *arg);
+static void run_basebackup(const char *remote_connstr, const char *data_dir);
+static void wait_postmaster_connection(const char *connstr);
+static void wait_postmaster_shutdown(void);
  
-#ifdef BUILDING_UDR
-static void initialize_bdr(PGconn *conn);
-#endif
-static void remove_unwanted_state(void);
-static void initialize_replication_identifiers(char *remote_lsn);
-static void create_replication_identifier(PGconn *conn,
-               const char *remote_ident, char *remote_lsn);
-static char *create_restore_point(char *remote_connstr);
-static void initialize_replication_slots(bool init_replica);
-static void create_replication_slot(PGconn *conn, Name slot_name);
-static RemoteInfo *get_remote_info(PGconn *conn, char* aux_connstr);
-static Oid get_dboid_from_dbname(PGconn *conn, const char* dbname);
+static void validate_remote_node(PGconn *conn);
+static void initialize_node_entry(PGconn *conn, NodeInfo *ni, Oid dboid,
+                                 char *remote_connstr);
+static void remove_unwanted_files(void);
+static void remove_unwanted_data(PGconn *conn, char *dbname);
+static void initialize_replication_identifier(PGconn *conn, NodeInfo *ni, Oid dboid, char *remote_lsn);
+static char *create_restore_point(PGconn *conn, char *restore_point_name);
+static void initialize_replication_slot(PGconn *conn, NodeInfo *ni, Oid dboid);
+static void bdr_node_start(PGconn *conn, char *remote_connstr, char *local_connstr);
+
+static RemoteInfo *get_remote_info(char* connstr);
+
+static void initialize_data_dir(char *data_dir, char *connstr,
+                   char *postgresql_conf, char *pg_hba_conf);
  
  static uint64 GenerateSystemIdentifier(void);
-static int set_sysid(void);
+static int set_sysid(uint64 sysid);
  
-static void read_bdr_config(void);
  static void WriteRecoveryConf(PQExpBuffer contents);
+static void CopyConfFile(char *fromfile, char *tofile);
  
-static char *detect_local_conninfo(void);
-static char *detect_remote_conninfo(void);
-char *get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser);
-static char *PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values);
-static char *escapeConninfoValue(const char *val);
+char *get_connstr(char *dbname, char *dbhost, char *dbport, char *dbuser);
+static char *PQconninfoParamsToConnstr(const char *const * keywords, const char *const * values);
+static void appendPQExpBufferConnstrValue(PQExpBuffer buf, const char *str);
  
-static bool parse_bool(const char *value, bool *result);
-static bool parse_bool_with_len(const char *value, size_t len, bool *result);
-static char *trimwhitespace(const char *str);
-static char    **split_list_guc(char *str, size_t *count);
-
-static bool is_pg_dir(char *path);
+static bool file_exists(const char *path);
+static bool is_pg_dir(const char *path);
+static void copy_file(char *fromfile, char *tofile);
  static char *find_other_exec_or_die(const char *argv0, const char *target, const char *versionstr);
  static bool postmaster_is_alive(pid_t pid);
  static long get_pgpid(void);
-static char **readfile(const char *path);
-static void free_readfile(char **optlines);
  
+/*
+ * Open a libpq connection, optionally to a specific database.
+ *
+ * connstr is the base connection string. If dbname is not NULL, a
+ * "dbname=..." keyword is appended to it before connecting. Calls die()
+ * if the connection cannot be established, so the returned connection is
+ * always in CONNECTION_OK state. The caller must PQfinish() it.
+ */
+static PGconn *
+connectdb(char *connstr, const char *dbname)
+{
+   PGconn      *conn;
+   PQExpBuffer  connbuf = createPQExpBuffer();
+
+   /* TODO: deparse and reconstruct the connection string properly. */
+   appendPQExpBufferStr(connbuf, connstr);
+   if (dbname)
+   {
+       appendPQExpBufferStr(connbuf, " dbname=");
+       appendPQExpBufferConnstrValue(connbuf, dbname);
+   }
+
+   conn = PQconnectdb(connbuf->data);
+   if (PQstatus(conn) != CONNECTION_OK)
+       die(_("Connection to database failed: %s, connection string was: %s\n"), PQerrorMessage(conn), connbuf->data);
+
+   /*
+    * The connection string is only needed while connecting (and for the
+    * error message above), release it instead of leaking one copy per call.
+    */
+   destroyPQExpBuffer(connbuf);
+
+   return conn;
+}
  
  void signal_handler(int sig)
  {
@@ -129,13 +159,40 @@ main(int argc, char **argv)
     int i;
     int c;
     PQExpBuffer recoveryconfcontents = createPQExpBuffer();
-   char *remote_lsn;
-   bool hot_standby;
+   RemoteInfo *remote_info;
+   NodeInfo    node_info;
+   char        restore_point_name[NAMEDATALEN];
+   char       *remote_lsn;
+   bool        stop = false;
+   int         optindex;
     char *local_connstr = NULL;
+   char *local_dbhost = NULL,
+        *local_dbport = NULL,
+        *local_dbuser = NULL;
     char *remote_connstr = NULL;
-   char *dbhost = NULL,
-        *dbport = NULL,
-        *dbuser = NULL;
+   char *remote_dbhost = NULL,
+        *remote_dbport = NULL,
+        *remote_dbuser = NULL;
+   char *postgresql_conf = NULL,
+        *pg_hba_conf = NULL,
+        *recovery_conf = NULL;
+
+   static struct option long_options[] = {
+       {"pgdata", required_argument, NULL, 'D'},
+       {"remote-dbname", required_argument, NULL, 'd'},
+       {"remote-host", required_argument, NULL, 'h'},
+       {"remote-port", required_argument, NULL, 'p'},
+       {"remote-user", required_argument, NULL, 'U'},
+       {"local-dbname", required_argument, NULL, 2},
+       {"local-host", required_argument, NULL, 3},
+       {"local-port", required_argument, NULL, 4},
+       {"local-user", required_argument, NULL, 5},
+       {"postgresql-conf", required_argument, NULL, 6},
+       {"hba-conf", required_argument, NULL, 7},
+       {"recovery-conf", required_argument, NULL, 8},
+       {"stop", no_argument, NULL, 's'},
+       {NULL, 0, NULL, 0}
+   };
  
     argv0 = argv[0];
     progname = get_progname(argv[0]);
@@ -156,30 +213,66 @@ main(int argc, char **argv)
     }
  
     /* Option parsing and validation */
-   while ((c = getopt(argc, argv, "D:d:h:o:p:U:")) != -1)
+   while ((c = getopt_long(argc, argv, "D:d:h:p:s:U:v", long_options, &optindex)) != -1)
     {
         switch (c)
         {
             case 'D':
                 data_dir = pg_strdup(optarg);
                 break;
-           case 'o':
-               config_options = pg_strdup(optarg);
-               break;
             case 'd':
                 remote_connstr = pg_strdup(optarg);
                 break;
             case 'h':
-               dbhost = pg_strdup(optarg);
+               remote_dbhost = pg_strdup(optarg);
                 break;
             case 'p':
-               dbport = pg_strdup(optarg);
+               remote_dbport = pg_strdup(optarg);
                 break;
             case 'U':
-               dbuser = pg_strdup(optarg);
+               remote_dbuser = pg_strdup(optarg);
+               break;
+           case 'v':
+               verbosity++;
+               break;
+           case 2:
+               local_connstr = pg_strdup(optarg);
+               break;
+           case 3:
+               local_dbhost = pg_strdup(optarg);
+               break;
+           case 4:
+               local_dbport = pg_strdup(optarg);
+               break;
+           case 5:
+               local_dbuser = pg_strdup(optarg);
+               break;
+           case 6:
+               {
+                   postgresql_conf = pg_strdup(optarg);
+                   if (postgresql_conf != NULL && !file_exists(postgresql_conf))
+                       die(_("The specified postgresql.conf file does not exist."));
+                   break;
+               }
+           case 7:
+               {
+                   pg_hba_conf = pg_strdup(optarg);
+                   if (pg_hba_conf != NULL && !file_exists(pg_hba_conf))
+                       die(_("The specified pg_hba.conf file does not exist."));
+                   break;
+               }
+           case 8:
+               {
+                   recovery_conf = pg_strdup(optarg);
+                   if (recovery_conf != NULL && !file_exists(recovery_conf))
+                       die(_("The specified recovery.conf file does not exist."));
+                   break;
+               }
+           case 's':
+               stop = true;
                 break;
             default:
-               fprintf(stderr, _("%s: unknown option\n"), progname);
+               fprintf(stderr, _("Unknown option\n"));
                 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                 exit(1);
         }
@@ -187,110 +280,199 @@ main(int argc, char **argv)
  
     if (data_dir == NULL)
     {
-       fprintf(stderr, _("%s: no data directory specified\n"), progname);
+       fprintf(stderr, _("No data directory specified\n"));
         fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
         exit(1);
     }
-   if (!is_pg_dir(data_dir))
-   {
-       die(_("%s: \"%s\" is not valid postgres data directory\n"), progname, data_dir);
-   }
-   snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", data_dir);
  
-   print_msg(_("%s: starting...\n"), progname);
+   remote_connstr = get_connstr(remote_connstr, remote_dbhost, remote_dbport, remote_dbuser);
+   local_connstr = get_connstr(local_connstr, local_dbhost, local_dbport, local_dbuser);
  
+   if (!remote_connstr || !strlen(remote_connstr))
+       die(_("Remote connection must be specified.\n"));
+   if (!local_connstr || !strlen(local_connstr))
+       die(_("Local connection must be specified.\n"));
+
+   print_msg(VERBOSITY_NORMAL, _("%s: starting ...\n"), progname);
+
+   /*
+    * Generate new identifier for local node.
+    */
+   node_info.local_sysid = GenerateSystemIdentifier();
+   print_msg(VERBOSITY_VERBOSE,
+             _("Generated new local system identifier: "UINT64_FORMAT"\n"),
+             node_info.local_sysid);
+
+   /* Read the remote server identification. */
+   print_msg(VERBOSITY_NORMAL,
+             _("Getting remote server identification ...\n"));
+   remote_info = get_remote_info(remote_connstr);
+
+   /* If there are no BDR enabled dbs, just bail. */
+   if (remote_info->numdbs < 1)
+       die(_("Remote node does not have any BDR enabled databases.\n"));
+
+   print_msg(VERBOSITY_NORMAL,
+             _("Detected %d BDR database(s) on remote server\n"),
+             remote_info->numdbs);
+
+   node_info.remote_sysid = remote_info->sysid;
+   node_info.remote_tlid = remote_info->tlid;
     /*
-    * Initialization
+    * Once the physical replication reaches the restore point, it will
+    * bump the timeline by one.
      */
-   system_identifier = GenerateSystemIdentifier();
-   print_msg(_("Assigning new system identifier: "UINT64_FORMAT"...\n"), system_identifier);
+   node_info.local_tlid = remote_info->tlid + 1;
  
-   read_bdr_config();
+   print_msg(VERBOSITY_NORMAL,
+             _("Updating BDR configuration on the remote node:\n"));
  
-   if (!remote_connstr && !dbhost && !dbport && !dbuser)
-       remote_connstr = detect_remote_conninfo();
-   else
-       remote_connstr = get_conninfo(remote_connstr, dbhost, dbport, dbuser);
+   /*
+    * Initialize remote node: for every BDR-enabled database, validate it,
+    * create the replication slot and register the future local node.
+    */
+   for (i = 0; i < remote_info->numdbs; i++)
+   {
+       char *dbname = remote_info->dbnames[i];
+       remote_conn = connectdb(remote_connstr, dbname);
  
-   if (!remote_connstr || !strlen(remote_connstr))
-       die(_("Could not detect remote connection\n"));
+       /*
+        * Make sure that we can use the remote node as init node.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: validating BDR configuration ...\n"), dbname);
+       validate_remote_node(remote_conn);
+
+       /*
+        * Create replication slots on remote node.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: creating replication slot ...\n"), dbname);
+       initialize_replication_slot(remote_conn, &node_info, remote_info->dboids[i]);
  
-   local_connstr = detect_local_conninfo();
-   if (local_connstr == NULL)
-       die(_("Failed to detect local connection info. Please specify replica_local_dsn in the postgresql.conf.\n"));
+       /*
+        * Create node entry for future local node.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: creating node entry for local node ...\n"), dbname);
+       initialize_node_entry(remote_conn, &node_info, remote_info->dboids[i],
+                             remote_connstr);
  
-   /* Hot standby would start cluster in read only mode, we don't want that. */
-   if (!parse_bool(get_postgres_guc_value("hot_standby", NULL), &hot_standby))
-       die(_("Invalid boolean value for configuration parameter \"hot_standby\"\n"));
-   if (hot_standby)
-       die(_("Cluster cannot be configured with hot_standby = on when using bdr\n"));
+       /*
+        * Don't hold connection since the next step might take long time.
+        * Reset the pointer as well: die() calls PQfinish() on any non-NULL
+        * connection and must not see the one we just closed.
+        */
+       PQfinish(remote_conn);
+       remote_conn = NULL;
+   }
  
-   remove_unwanted_state();
+   /*
+    * Create basebackup or use existing one
+    */
+   initialize_data_dir(data_dir, remote_connstr, postgresql_conf, pg_hba_conf);
+   snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", data_dir);
  
     /*
-    * Initialization done, create replication slots to init node
-    * and restore point on remote side.
+    * Create restore point to which we will catchup via physical replication.
      */
-   print_msg(_("Creating primary replication slots...\n"));
-   initialize_replication_slots(true);
+   remote_conn = PQconnectdb(remote_connstr);
+   if (PQstatus(remote_conn) != CONNECTION_OK)
+       die(_("Connection to remote node failed: %s"), PQerrorMessage(remote_conn));
+
+   print_msg(VERBOSITY_NORMAL, _("Creating restore point on remote node ...\n"));
+
+   /* The restore point is named after the new local system identifier. */
+   snprintf(restore_point_name, NAMEDATALEN,
+            "bdr_"UINT64_FORMAT, node_info.local_sysid);
+   remote_lsn = create_restore_point(remote_conn, restore_point_name);
  
-   print_msg(_("Creating restore point...\n"));
-   snprintf(NameStr(restore_point_name), NAMEDATALEN,
-            "bdr_"UINT64_FORMAT, system_identifier);
-   remote_lsn = create_restore_point(remote_connstr);
+   /*
+    * Done with the remote node for now. Clear the pointer so that a later
+    * die() does not call PQfinish() on an already closed connection.
+    */
+   PQfinish(remote_conn);
+   remote_conn = NULL;
  
     /*
      * Get local db to consistent state (for lsn after slot creation).
      */
-   print_msg(_("Bringing cluster to the restore point...\n"));
-   appendPQExpBuffer(recoveryconfcontents, "standby_mode = 'on'\n");
-   appendPQExpBuffer(recoveryconfcontents, "recovery_target_name = '%s'\n", NameStr(restore_point_name));
+   print_msg(VERBOSITY_NORMAL,
+             _("Bringing local node to the restore point ...\n"));
+   if (recovery_conf)
+   {
+       CopyConfFile(recovery_conf, "recovery.conf");
+   }
+   else
+   {
+       appendPQExpBuffer(recoveryconfcontents, "standby_mode = 'on'\n");
+       appendPQExpBuffer(recoveryconfcontents, "primary_conninfo = '%s'\n", remote_connstr);
+   }
+   appendPQExpBuffer(recoveryconfcontents, "recovery_target_name = '%s'\n", restore_point_name);
     appendPQExpBuffer(recoveryconfcontents, "recovery_target_inclusive = true\n");
-   appendPQExpBuffer(recoveryconfcontents, "primary_conninfo = '%s'\n", remote_connstr);
     WriteRecoveryConf(recoveryconfcontents);
  
-   run_pg_ctl("start -w -l \"bdr_init_copy_postgres.log\"",
-#ifdef BUILDING_BDR
-              "-c shared_preload_libraries=''"
-#else
-              ""
-#endif
-              );
-   if (!wait_postmaster_connection())
-       die(_("Could not connect to local node"));
+   /*
+    * Start local node with BDR disabled, and wait until it starts accepting
+    * connections which means it has caught up to the restore point.
+    */
+   run_pg_ctl("start -l \"bdr_init_copy_postgres.log\" -o \"-c shared_preload_libraries=''\"");
+   wait_postmaster_connection(local_connstr);
  
     /*
-    * Postgres should have reached restore point and is accepting connections,
-    * create slots to other nodes and local replication identifiers.
+    * Clean any per-node data that were copied by pg_basebackup.
      */
-   local_conn = PQconnectdb(local_connstr);
-   if (PQstatus(local_conn) != CONNECTION_OK)
-       die(_("Connection to database failed: %s"), PQerrorMessage(local_conn));
-
-#ifdef BUILDING_UDR
-   print_msg(_("Ensuring bdr extension is installed...\n"));
-   initialize_bdr(remote_conn);
-   initialize_bdr(local_conn);
-#endif
+   for (i = 0; i < remote_info->numdbs; i++)
+   {
+       local_conn = connectdb(local_connstr, remote_info->dbnames[i]);
+
+       remove_unwanted_data(local_conn, remote_info->dbnames[i]);
+
+       PQfinish(local_conn);
+       local_conn = NULL;
+   }
  
-   print_msg(_("Creating secondary replication slots...\n"));
-   initialize_replication_slots(false);
-   print_msg(_("Creating local replication identifier...\n"));
-   initialize_replication_identifiers(remote_lsn);
+   /* Stop Postgres so we can reset system id and start it with BDR loaded. */
+   run_pg_ctl("stop");
+   wait_postmaster_shutdown();
  
-   PQfinish(local_conn);
-   local_conn = NULL;
+   /*
+    * Individualize the local node by changing the system identifier.
+    */
+   set_sysid(node_info.local_sysid);
  
     /*
-    * Make this node functional as individual bdr node and start it.
+    * Start the node again, now with BDR active so that we can join the node
+    * to the BDR cluster. This is final start, so don't log to special log
+    * file anymore.
      */
-   run_pg_ctl("stop", "");
-   wait_postgres_shutdown();
+   print_msg(VERBOSITY_NORMAL,
+             _("Initializing BDR on the local node:\n"));
+
+   run_pg_ctl("start -l \"bdr_init_copy_postgres.log\"");
+   wait_postmaster_connection(local_connstr);
+
+   for (i = 0; i < remote_info->numdbs; i++)
+   {
+       char *dbname = remote_info->dbnames[i];
+
+       local_conn = connectdb(local_connstr, dbname);
+
+       /*
+        * Create the identifier which is setup with the position to which we already
+        * caught up using physical replication.
+        */
+       print_msg(VERBOSITY_VERBOSE,
+                 _(" %s: creating replication identifier ...\n"), dbname);
+       initialize_replication_identifier(local_conn, &node_info, remote_info->dboids[i], remote_lsn);
+
+       /*
+        * And finally add the node to the cluster.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: adding the database to BDR cluster ...\n"), dbname);
+       bdr_node_start(local_conn, remote_connstr, local_connstr);
  
-   set_sysid();
+       PQfinish(local_conn);
+       local_conn = NULL;
+   }
+
+   /* If user does not want the node to be running at the end, stop it. */
+   if (stop)
+   {
+       print_msg(VERBOSITY_NORMAL, _("Stopping the local node ...\n"));
+       run_pg_ctl("stop");
+       wait_postmaster_shutdown();
+   }
  
-   print_msg(_("Starting the cluster...\n"));
-   run_pg_ctl("start -w", "-c bdr.init_from_basedump=true");
+   print_msg(VERBOSITY_NORMAL, _("All done\n"));
  
     return 0;
  }
@@ -302,19 +484,29 @@ main(int argc, char **argv)
  static void
  usage(void)
  {
-   printf(_("%s initializes bdr from PostgreSQL instance made using pg_basebackup.\n\n"), progname);
-   printf(_("pg_basebackup -X stream must be used to populate the data directory before\n"));
-   printf(_("running %s to initialize BDR on it.\n\n"), progname);
+   printf(_("%s initializes new BDR node from existing BDR instance.\n\n"), progname);
     printf(_("Usage:\n"));
     printf(_("  %s [OPTION]...\n"), progname);
     printf(_("\nGeneral options:\n"));
-   printf(_("  -D, --pgdata=DIRECTORY base backup directory\n"));
-   printf(_("  -o                     configuration options passed to pg_ctl's -o\n"));
+   printf(_("  -D, --pgdata=DIRECTORY data directory to be used for new nodem,\n"));
+   printf(_("                         can be either empty/non-existing directory,\n"));
+   printf(_("                         or directory populated using pg_basebackup -X stream\n"));
+   printf(_("                         command\n"));
+   printf(_("  -s, --stop             stop the server once the initialization is done\n"));
+   printf(_("  --postgresql-conf      path to the new postgresql.conf\n"));
+   printf(_("  --hba-conf             path to the new pg_hba.conf\n"));
+   printf(_("  --recovery-conf        path to the template recovery.conf\n"));
     printf(_("\nConnection options:\n"));
-   printf(_("  -d, --dbname=CONNSTR   connection string\n"));
-   printf(_("  -h, --host=HOSTNAME    database server host or socket directory\n"));
-   printf(_("  -p, --port=PORT        database server port number\n"));
-   printf(_("  -U, --username=NAME    connect as specified database user\n"));
+   printf(_("  -d, --remote-dbname=CONNSTR\n"));
+   printf(_("                         connection string for remote node\n"));
+   printf(_("  -h, --remote-host=HOSTNAME\n"));
+   printf(_("                         server host or socket directory for remote node\n"));
+   printf(_("  -p, --remote-port=PORT server port number for remote node\n"));
+   printf(_("  -U, --remote-user=NAME connect as specified database user to the remote node\n"));
+   printf(_("  --local-dbname=CONNSTR connection string for local node\n"));
+   printf(_("  --local-host=HOSTNAME  server host or socket directory for local node\n"));
+   printf(_("  --local-port=PORT      server port number for local node\n"));
+   printf(_("  --local-user=NAME      connect as specified database user to the local node\n"));
  }
  
  /*
@@ -328,11 +520,13 @@ die(const char *fmt,...)
     vfprintf(stderr, fmt, argptr);
     va_end(argptr);
  
-   PQfinish(local_conn);
-   PQfinish(remote_conn);
+   if (local_conn)
+       PQfinish(local_conn);
+   if (remote_conn)
+       PQfinish(remote_conn);
  
     if (get_pgpid())
-       run_pg_ctl("stop -s", "");
+       run_pg_ctl("stop -s");
  
     exit(1);
  }
@@ -341,13 +535,16 @@ die(const char *fmt,...)
   * Print message to stdout and flush
   */
  static void
-print_msg(const char *fmt,...)
+print_msg(VerbosityLevelEnum level, const char *fmt,...)
  {
-   va_list argptr;
-   va_start(argptr, fmt);
-   vfprintf(stdout, fmt, argptr);
-   va_end(argptr);
-   fflush(stdout);
+   if (verbosity >= level)
+   {
+       va_list argptr;
+       va_start(argptr, fmt);
+       vfprintf(stdout, fmt, argptr);
+       va_end(argptr);
+       fflush(stdout);
+   }
  }
  
  
@@ -355,15 +552,19 @@ print_msg(const char *fmt,...)
   * Start pg_ctl with given argument(s) - used to start/stop postgres
   */
  static int
-run_pg_ctl(const char *arg, const char *opts)
+run_pg_ctl(const char *arg)
  {
     int          ret;
     PQExpBuffer  cmd = createPQExpBuffer();
     char        *exec_path = find_other_exec_or_die(argv0, "pg_ctl", "pg_ctl (PostgreSQL) " PG_VERSION "\n");
  
-   appendPQExpBuffer(cmd, "%s %s -D \"%s\" -o \"%s %s\"", exec_path, arg, data_dir,
-                     opts, config_options);
+   appendPQExpBuffer(cmd, "%s %s -D \"%s\" -s", exec_path, arg, data_dir);
+
+   /* Run pg_ctl in silent mode unless we run in debug mode. */
+   if (verbosity < VERBOSITY_DEBUG)
+       appendPQExpBuffer(cmd, " -s");
  
+   print_msg(VERBOSITY_DEBUG, _("Running pg_ctl: %s.\n"), cmd->data);
     ret = system(cmd->data);
  
     destroyPQExpBuffer(cmd);
@@ -373,53 +574,43 @@ run_pg_ctl(const char *arg, const char *opts)
  
  
  /*
- * Ugly way to read postgresql.conf
+ * Run pg_basebackup to create the copy of the origin node.
+ *
+ * The pg_basebackup binary is located via find_other_exec_or_die() and run
+ * through system() with data_dir as the target directory and
+ * remote_connstr as the source connection string. Calls die() if the
+ * command returns non-zero.
   */
-static char *
-get_postgres_guc_value(char *guc, char *defval)
+static void
+run_basebackup(const char *remote_connstr, const char *data_dir)
  {
-   FILE        *fp;
-   int          status;
+   int          ret;
     PQExpBuffer  cmd = createPQExpBuffer();
-   char        *exec_path = find_other_exec_or_die(argv0, "postgres", PG_BACKEND_VERSIONSTR);
-   PQExpBuffer  retbuf = createPQExpBuffer();
-   char         buf[8192];
-   char        *ret;
-
-   printfPQExpBuffer(cmd, "%s -D \"%s\" %s -C \"%s\" 2>\"%s\"",
-                     exec_path, data_dir, config_options, guc, DEVNULL);
+   char        *exec_path = find_other_exec_or_die(argv0, "pg_basebackup", "pg_basebackup (PostgreSQL) " PG_VERSION "\n");
  
-   fp = popen(cmd->data, "r");
-   while (fgets(buf, sizeof(buf), fp) != NULL)
-       appendPQExpBufferStr(retbuf, buf);
+   /* NOTE(review): both values are only wrapped in double quotes, not shell-escaped. */
+   appendPQExpBuffer(cmd, "%s -D \"%s\" -d \"%s\" -X s -P", exec_path, data_dir, remote_connstr);
  
-   status = pclose(fp);
-   destroyPQExpBuffer(cmd);
+   /* Run pg_basebackup in verbose mode if we are running in verbose mode. */
+   if (verbosity >= VERBOSITY_VERBOSE)
+       appendPQExpBuffer(cmd, " -v");
  
-   if (status != 0)
-   {
-       destroyPQExpBuffer(retbuf);
-       return defval;
-   }
+   /* The full command line is shown only at debug verbosity. */
+   print_msg(VERBOSITY_DEBUG, _("Running pg_basebackup: %s.\n"), cmd->data);
+   ret = system(cmd->data);
  
-   ret = trimwhitespace(retbuf->data);
-   destroyPQExpBuffer(retbuf);
+   destroyPQExpBuffer(cmd);
  
-   return ret;
+   if (ret != 0)
+       die(_("pg_basebackup failed, cannot continue.\n"));
  }
  
  /*
   * Set system identifier to system id we used for registering the slots.
   */
  static int
-set_sysid(void)
+set_sysid(uint64 sysid)
  {
     int          ret;
     PQExpBuffer  cmd = createPQExpBuffer();
     char        *exec_path = find_other_exec_or_die(argv0, "bdr_resetxlog", "bdr_resetxlog (PostgreSQL) " PG_VERSION "\n");
  
-   appendPQExpBuffer(cmd, "%s \"-s "UINT64_FORMAT"\" \"%s\"", exec_path, system_identifier, data_dir);
+   appendPQExpBuffer(cmd, "%s \"-s "UINT64_FORMAT"\" \"%s\"", exec_path, sysid, data_dir);
  
+   print_msg(VERBOSITY_DEBUG, _("Running bdr_resetxlog: %s.\n"), cmd->data);
     ret = system(cmd->data);
  
     destroyPQExpBuffer(cmd);
@@ -427,105 +618,11 @@ set_sysid(void)
     return ret;
  }
  
-
-/*
- * Read bdr configuration
- *
- * This is somewhat ugly version of bdr_create_con_gucs and parts of _PG_init
- */
-static void
-read_bdr_config(void)
-{
-   char        *connections;
-   char        *errormsg = NULL;
-   int         connection_config_idx;
-   size_t      connection_count = 0;
-   char        **connames;
-   PQconninfoOption *options;
-   PQconninfoOption *cur_option;
-
-   connections = get_postgres_guc_value("bdr.connections", NULL);
-   if (!connections)
-       die(_("bdr.connections is empty\n"));
-
-   connames = split_list_guc(connections, &connection_count);
-   pg_free(connections);
-
-   bdr_connection_config_count = connection_count;
-   bdr_connection_configs = (BdrConnectionConfig**)
-       pg_malloc0(bdr_connection_config_count * sizeof(BdrConnectionConfig*));
-
-   for (connection_config_idx = 0; connection_config_idx < connection_count; connection_config_idx++)
-   {
-       char    *name = (char *) connames[connection_config_idx];
-       char    *optname_dsn = pg_malloc(strlen(name) + 30);
-       char    *optname_local_dsn = pg_malloc(strlen(name) + 30);
-       char    *optname_replica = pg_malloc(strlen(name) + 30);
-       char    *optname_local_dbname = pg_malloc(strlen(name) + 30);
-       BdrConnectionConfig *opts;
-
-       sprintf(optname_dsn, "bdr.%s_dsn", name);
-       sprintf(optname_local_dsn, "bdr.%s_replica_local_dsn", name);
-       sprintf(optname_replica, "bdr.%s_init_replica", name);
-       sprintf(optname_local_dbname, "bdr.%s_local_dbname", name);
-
-       opts = pg_malloc0(sizeof(BdrConnectionConfig));
-       opts->name = pg_strdup(name);
-       opts->is_valid = false;
-
-       bdr_connection_configs[connection_config_idx] = opts;
-
-       opts->dsn = get_postgres_guc_value(optname_dsn, NULL);
-       if (!opts->dsn)
-           continue;
-
-       opts->replica_local_dsn = get_postgres_guc_value(optname_local_dsn, NULL);
-
-       if (!parse_bool(get_postgres_guc_value(optname_replica, "false"), &opts->init_replica))
-           die(_("Invalid boolean value for configuration parameter \"%s\"\n"), optname_replica);
-
-       opts->dbname = get_postgres_guc_value(optname_local_dbname, NULL);
-
-       options = PQconninfoParse(opts->dsn, &errormsg);
-       if (errormsg != NULL)
-       {
-           char *str = pg_strdup(errormsg);
-
-           PQfreemem(errormsg);
-           die(_("bdr %s: error in dsn: %s\n"), name, str);
-       }
-
-       if (opts->dbname == NULL)
-       {
-           cur_option = options;
-           while (cur_option->keyword != NULL)
-           {
-               if (strcmp(cur_option->keyword, "dbname") == 0)
-               {
-                   if (cur_option->val == NULL)
-                       die(_("bdr %s: no dbname set\n"), name);
-
-                   opts->dbname = pg_strdup(cur_option->val);
-               }
-               cur_option++;
-           }
-       }
-
-
-       opts->is_valid = true;
-
-       /* cleanup */
-       PQconninfoFree(options);
-   }
-}
-
-
-
  /*
   * Cleans everything that was replicated via basebackup but we don't want it.
   */
  static void
-remove_unwanted_state(void)
+remove_unwanted_files(void)
  {
  #ifdef BUILDING_BDR
     DIR             *lldir;
@@ -535,6 +632,9 @@ remove_unwanted_state(void)
  
     printfPQExpBuffer(llpath, "%s/%s", data_dir, LLOGCDIR);
  
+   print_msg(VERBOSITY_DEBUG, _("Removing data from \"%s\" directory.\n"),
+             llpath->data);
+
     /*
      * Remove stray logical replication checkpoints
      */
@@ -577,121 +677,123 @@ remove_unwanted_state(void)
  #endif
  }
  
-
  /*
- * Initialize replication slots
+ * Init the datadir
   *
- * Get connection configs from bdr and use the info
- * to register replication slots for future use.
+ * This function can either ensure provided datadir is a postgres datadir,
+ * or create it using pg_basebackup.
+ *
+ * In any case, new postgresql.conf and pg_hba.conf will be copied to the
+ * datadir if they are provided.
   */
  static void
-initialize_replication_slots(bool init_replica)
+initialize_data_dir(char *data_dir, char *connstr,
+                   char *postgresql_conf, char *pg_hba_conf)
  {
-   int      i;
-
-   for (i = 0; i < bdr_connection_config_count; i++)
+   /* Run basebackup as needed. */
+   switch (pg_check_dir(data_dir))
     {
-       NameData     slot_name;
-       char         remote_ident[256];
-       RemoteInfo  *ri;
-       TimeLineID   tlid;
-       Oid          dboid;
-       char         system_identifier_s[32];
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
-       PQExpBuffer      conninfo = createPQExpBuffer();
-
-       if (!cfg || !cfg->is_valid || cfg->init_replica != init_replica)
-           continue;
-
-       printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn);
-       remote_conn = PQconnectdb(conninfo->data);
-       destroyPQExpBuffer(conninfo);
-
-       if (PQstatus(remote_conn) != CONNECTION_OK)
-       {
-           die(_("Could not connect to the remote server: %s\n"),
-                       PQerrorMessage(remote_conn));
-       }
-
-       ri = get_remote_info(remote_conn, cfg->dsn);
-       dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname);
-
-       /* XXX: this might break if timeline switch happens in meantime */
-       tlid = cfg->init_replica ? ri->tlid + 1 : ri->tlid;
-
-       snprintf(system_identifier_s, sizeof(system_identifier_s), UINT64_FORMAT, system_identifier);
-       snprintf(NameStr(slot_name), NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
-                ri->dboid, system_identifier_s, tlid,
-                dboid, "");
-       NameStr(slot_name)[NAMEDATALEN - 1] = '\0';
-
-       create_replication_slot(remote_conn, &slot_name);
+       case 0:     /*Does not exist */
+       case 1:     /* Exists, empty */
+           {
+               if (connstr)
+               {
+                   print_msg(VERBOSITY_NORMAL,
+                             _("Creating base backup of the remote node...\n"));
+                   run_basebackup(connstr, data_dir);
+               }
+               else
+                   die(_("Directory \"%s\" does not exist.\n"),
+                       data_dir);
+               break;
+           }
+       case 2:
+       case 3:     /* Exists, not empty */
+       case 4:
+           {
+               if (!is_pg_dir(data_dir))
+                   die(_("Directory \"%s\" exists but is not valid postgres data directory.\n"),
+                       data_dir);
+               break;
+           }
+       case -1:    /* Access problem */
+           die(_("Could not access directory \"%s\": %s.\n"),
+               data_dir, strerror(errno));
+   }
  
-       PQfinish(remote_conn);
-       remote_conn = NULL;
+   remove_unwanted_files();
  
-       snprintf(remote_ident, sizeof(remote_ident),
-               BDR_NODE_ID_FORMAT,
-               ri->sysid, ri->tlid, ri->dboid, dboid,
-               "");
-   }
+   if (postgresql_conf)
+       CopyConfFile(postgresql_conf, "postgresql.conf");
+   if (pg_hba_conf)
+       CopyConfFile(pg_hba_conf, "pg_hba.conf");
  }
  
  /*
- * Get database Oid of the remotedb.
+ * Initialize replication slots
   *
- * Can't use the bdr_get_remote_dboid because it needs elog :(
+ * Get connection configs from bdr and use the info
+ * to register replication slots for future use.
   */
-static Oid
-get_remote_dboid(char *conninfo_db)
+static void
+initialize_replication_slot(PGconn *conn, NodeInfo *ni, Oid dboid)
  {
-   PGconn     *dbConn;
+   char        slotname[NAMEDATALEN];
+   char        system_identifier_s[32];
+   PQExpBuffer query = createPQExpBuffer();
     PGresult   *res;
-   char       *remote_dboid;
-   Oid         remote_dboid_i;
-
-   dbConn = PQconnectdb(conninfo_db);
-   if (PQstatus(dbConn) != CONNECTION_OK)
-   {
-       die(_("Could not connect to the primary server: %s"), PQerrorMessage(dbConn));
-   }
  
-   res = PQexec(dbConn, "SELECT oid FROM pg_database WHERE datname = current_database()");
-   if (PQresultStatus(res) != PGRES_TUPLES_OK)
-       die(_("Could fetch database oid: %s"), PQerrorMessage(dbConn));
+   snprintf(system_identifier_s, sizeof(system_identifier_s), UINT64_FORMAT, ni->local_sysid);
+   snprintf(slotname, NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
+            dboid, system_identifier_s, ni->local_tlid, dboid, "");
+   appendPQExpBuffer(query, "SELECT pg_create_logical_replication_slot(%s, '%s');",
+                     PQescapeLiteral(conn, slotname, NAMEDATALEN), "bdr");
  
-   if (PQntuples(res) != 1 || PQnfields(res) != 1)
-       die(_("Could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"),
-            PQntuples(res), PQnfields(res), 1, 1);
+   res = PQexec(conn, query->data);
  
-   remote_dboid = PQgetvalue(res, 0, 0);
-   if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
-       die(_("could not parse remote database OID %s"), remote_dboid);
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+   {
+       die(_("Could not create replication slot, status %s: %s\n"),
+            PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+   }
  
     PQclear(res);
-   PQfinish(dbConn);
-
-   return remote_dboid_i;
+   destroyPQExpBuffer(query);
  }
  
  /*
   * Read replication info about remote connection
   */
  static RemoteInfo *
-get_remote_info(PGconn *conn, char* aux_connstr)
+get_remote_info(char* remote_connstr)
  {
-   RemoteInfo  *ri = (RemoteInfo *)pg_malloc(sizeof(RemoteInfo));
+   RemoteInfo *ri = (RemoteInfo *)pg_malloc(sizeof(RemoteInfo));
     char       *remote_sysid;
     char       *remote_tlid;
-   char       *remote_dboid;
+   int         i;
     PGresult   *res;
+   PQExpBuffer conninfo = createPQExpBuffer();
+
+   /*
+    * Fetch the system identification info (sysid, tlid) via replication
+    * connection - there is no way to get this info via SQL.
+    */
+   printfPQExpBuffer(conninfo, "%s replication=database", remote_connstr);
+   remote_conn = PQconnectdb(conninfo->data);
+   destroyPQExpBuffer(conninfo);
+
+   if (PQstatus(remote_conn) != CONNECTION_OK)
+   {
+       die(_("Could not connect to the remote server: %s\n"),
+                   PQerrorMessage(remote_conn));
+   }
  
-   res = PQexec(conn, "IDENTIFY_SYSTEM");
+   res = PQexec(remote_conn, "IDENTIFY_SYSTEM");
     if (PQresultStatus(res) != PGRES_TUPLES_OK)
     {
         PQclear(res);
         die(_("Could not send replication command \"%s\": %s\n"),
-            "IDENTIFY_SYSTEM", PQerrorMessage(conn));
+            "IDENTIFY_SYSTEM", PQerrorMessage(remote_conn));
     }
  
     if (PQntuples(res) != 1 || PQnfields(res) < 4 || PQnfields(res) > 5)
@@ -704,17 +806,6 @@ get_remote_info(PGconn *conn, char* aux_connstr)
     remote_sysid = PQgetvalue(res, 0, 0);
     remote_tlid = PQgetvalue(res, 0, 1);
  
-   if (PQnfields(res) == 5)
-   {
-       remote_dboid = PQgetvalue(res, 0, 4);
-       if (sscanf(remote_dboid, "%u", &ri->dboid) != 1)
-           die(_("could not parse remote database OID %s"), remote_dboid);
-   }
-   else
-   {
-       ri->dboid = get_remote_dboid(aux_connstr);
-   }
-
  #ifdef HAVE_STRTOULL
     ri->sysid = strtoull(remote_sysid, NULL, 10);
  #else
@@ -725,97 +816,97 @@ get_remote_info(PGconn *conn, char* aux_connstr)
         die(_("Could not parse remote tlid %s\n"), remote_tlid);
  
     PQclear(res);
+   PQfinish(remote_conn);
+   remote_conn = NULL;
  
-   return ri;
-}
+   /*
+    * Fetch list of BDR enabled databases via standard SQL connection.
+    */
+   remote_conn = PQconnectdb(remote_connstr);
+   if (PQstatus(remote_conn) != CONNECTION_OK)
+   {
+       die(_("Could not connect to the remote server: %s"), PQerrorMessage(remote_conn));
+   }
  
-/*
- * Get dboid based on dbname
- */
-static Oid
-get_dboid_from_dbname(PGconn *conn, const char* dbname)
-{
-   char        *dboid_str;
-   Oid          dboid;
-   PQExpBuffer  query = createPQExpBuffer();
-   PGresult    *res;
+   res = PQexec(remote_conn, "SELECT d.oid, d.datname "
+                "FROM pg_catalog.pg_database d, pg_catalog.pg_shseclabel l "
+                "WHERE l.provider = 'bdr' "
+                "  AND l.classoid = 'pg_database'::regclass "
+                "  AND d.oid = l.objoid;");
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       die(_("Could fetch remote database list: %s"), PQerrorMessage(remote_conn));
  
-   appendPQExpBuffer(query, "SELECT oid FROM pg_catalog.pg_database WHERE datname = '%s'",
-                    dbname);
+   ri->numdbs = PQntuples(res);
+   ri->dboids = (Oid *) pg_malloc(ri->numdbs * sizeof(Oid));
+   ri->dbnames = (char **) pg_malloc(ri->numdbs * sizeof(char *));
  
-   res = PQexec(conn, query->data);
-   if (PQresultStatus(res) != PGRES_TUPLES_OK || PQntuples(res) != 1)
+   for (i = 0; i < ri->numdbs; i++)
     {
-       PQclear(res);
-       die(_("Could not get database id for \"%s\": %s\n"),
-            dbname, PQerrorMessage(conn));
-   }
+       char   *remote_dboid = PQgetvalue(res, i, 0);
+       char   *remote_dbname = PQgetvalue(res, i, 1);
+       Oid     remote_dboid_i;
  
-   dboid_str = PQgetvalue(res, 0, 0);
-   if (sscanf(dboid_str, "%u", &dboid) != 1)
-       die(_("Could not parse database OID %s\n"), dboid_str);
+       if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
+           die(_("Could not parse database OID %s"), remote_dboid);
+
+       ri->dboids[i] = remote_dboid_i;
+       ri->dbnames[i] = pstrdup(remote_dbname);
+   }
  
     PQclear(res);
-   destroyPQExpBuffer(query);
  
-   return dboid;
+   PQfinish(remote_conn);
+   remote_conn = NULL;
+
+   return ri;
  }
  
+
  /*
- * Create replication slot
+ * Check if extension exists.
   */
-static void
-create_replication_slot(PGconn *conn, Name slot_name)
+static bool
+extension_exists(PGconn *conn, const char *extname)
  {
-   PQExpBuffer query = createPQExpBuffer();
-   PGresult   *res;
-
-   appendPQExpBuffer(query, "CREATE_REPLICATION_SLOT \"%s\" LOGICAL %s",
-                    NameStr(*slot_name), "bdr");
+   PQExpBuffer     query = createPQExpBuffer();
+   PGresult       *res;
+   bool            ret;
  
+   printfPQExpBuffer(query, "SELECT 1 FROM pg_catalog.pg_extension WHERE extname = %s;",
+                     PQescapeLiteral(conn, extname, strlen(extname)));
     res = PQexec(conn, query->data);
  
     if (PQresultStatus(res) != PGRES_TUPLES_OK)
     {
-       die(_("Could not send replication command \"%s\": status %s: %s\n"),
-            query->data,
-            PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+       PQclear(res);
+       die(_("Could not read extension info: %s\n"), PQerrorMessage(conn));
     }
  
+   ret = PQntuples(res) == 1;
+
     PQclear(res);
     destroyPQExpBuffer(query);
+
+   return ret;
  }
  
-#ifdef BUILDING_UDR
+/*
+ * Create extension.
+ */
  static void
-install_extension_if_not_exists(PGconn *conn, const char *extname)
+install_extension(PGconn *conn, const char *extname)
  {
     PQExpBuffer     query = createPQExpBuffer();
     PGresult       *res;
  
-   printfPQExpBuffer(query, "SELECT 1 FROM pg_catalog.pg_extension WHERE extname = %s;",
-                     PQescapeLiteral(conn, extname, strlen(extname)));
+   printfPQExpBuffer(query, "CREATE EXTENSION %s;",
+                     PQescapeIdentifier(conn, extname, strlen(extname)));
     res = PQexec(conn, query->data);
  
-   if (PQresultStatus(res) != PGRES_TUPLES_OK)
-   {
-       PQclear(res);
-       die(_("Could not read extension info: %s\n"), PQerrorMessage(conn));
-   }
-
-   if (PQntuples(res) != 1)
+   if (PQresultStatus(res) != PGRES_COMMAND_OK)
     {
         PQclear(res);
-
-       printfPQExpBuffer(query, "CREATE EXTENSION %s;",
-                         PQescapeIdentifier(conn, extname, strlen(extname)));
-       res = PQexec(conn, query->data);
-
-       if (PQresultStatus(res) != PGRES_COMMAND_OK)
-       {
-           PQclear(res);
-           die(_("Could not install %s extension: %s\n"), extname, PQerrorMessage(conn));
-       }
+       die(_("Could not install %s extension: %s\n"), extname, PQerrorMessage(conn));
     }
  
     PQclear(res);
@@ -823,81 +914,104 @@ install_extension_if_not_exists(PGconn *conn, const char *extname)
  }
  
  /*
- * Initialize bdr extension (if not already initialized).
- *
- * Should have similar logic as bdr_maintain_schema in bdr.c.
+ * Validate that BDR extension is installed on remote node
+ * and that there is at least one BDR node entry present.
   */
  static void
-initialize_bdr(PGconn *conn)
+validate_remote_node(PGconn *conn)
  {
-   install_extension_if_not_exists(conn, "btree_gist");
-   install_extension_if_not_exists(conn,"bdr");
-}
-#endif
+   PGresult   *res;
+   PQExpBuffer query = createPQExpBuffer();
  
-/*
- * Initialize new remote identifiers to specific position.
- */
-static void
-initialize_replication_identifiers(char *remote_lsn)
-{
-   int              i;
-   PGresult        *res;
+   if (!extension_exists(conn, "bdr"))
+       die(_("The BDR extension must be installed on remote node.\n"));
  
-   /* Remove replication identifiers */
-   res = PQexec(local_conn, "SELECT "RIINTERFACE_PREFIX"replication_identifier_drop(riname) FROM "RIINTERFACE_PREFIX"replication_identifier;");
+#ifdef BUILDING_BDR
+   res = PQexec(conn, "SELECT 1 FROM bdr.bdr_nodes;");
     if (PQresultStatus(res) != PGRES_TUPLES_OK)
     {
         PQclear(res);
-       die(_("Could not remove replication identifier: %s\n"), PQerrorMessage(local_conn));
+       die(_("Could fetch BDR info: %s\n"), PQerrorMessage(conn));
     }
  
-   /* Initialize new replication identifiers */
-   for (i = 0; i < bdr_connection_config_count; i++)
-   {
-       char        remote_ident[256];
-       Oid         dboid;
-       RemoteInfo  *ri;
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
-       PQExpBuffer conninfo = createPQExpBuffer();
+   if (PQntuples(res) < 1)
+       die(_("The remote node is not configured as a BDR node.\n"));
  
-       if (!cfg || !cfg->is_valid)
-           continue;
+   PQclear(res);
+#endif
  
-       printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn);
-       remote_conn = PQconnectdb(conninfo->data);
-       destroyPQExpBuffer(conninfo);
+   destroyPQExpBuffer(query);
+}
  
-       if (PQstatus(remote_conn) != CONNECTION_OK)
-       {
-           die(_("Could not connect to the remote server: %s\n"),
-                       PQerrorMessage(remote_conn));
-       }
  
-       ri = get_remote_info(remote_conn, cfg->dsn);
-       dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname);
+/*
+ * Insert node entry for local node to the remote's bdr_nodes.
+ */
+void
+initialize_node_entry(PGconn *conn, NodeInfo *ni, Oid dboid,
+                     char *remote_connstr)
+{
+   PQExpBuffer     query = createPQExpBuffer();
+   PGresult       *res;
  
-       PQfinish(remote_conn);
-       remote_conn = NULL;
+   printfPQExpBuffer(query, "INSERT INTO bdr.bdr_nodes"
+                            " (node_status, node_sysid, node_timeline,"
+                            "  node_dboid, node_init_from_dsn)"
+                            " VALUES ('c', '"UINT64_FORMAT"', %u, %u, %s);",
+                     ni->local_sysid, ni->local_tlid, dboid,
+                     PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)));
+   res = PQexec(conn, query->data);
+
+   if (PQresultStatus(res) != PGRES_COMMAND_OK)
+   {
+       PQclear(res);
+       die(_("Failed to insert row into bdr.bdr_nodes: %s\n"), PQerrorMessage(conn));
+   }
+
+   PQclear(res);
+   destroyPQExpBuffer(query);
+}
+
+/*
+ * Clean all the data that was copied from remote node but we don't
+ * want it here (currently shared security labels and replication identifiers).
+ */
+static void
+remove_unwanted_data(PGconn *conn, char *dbname)
+{
+   PGresult       *res;
  
-       snprintf(remote_ident, sizeof(remote_ident),
-               BDR_NODE_ID_FORMAT,
-               ri->sysid, ri->tlid, ri->dboid, dboid,
-               "");
+   /* Remove any BDR security labels. */
+   res = PQexec(conn, "DELETE FROM pg_catalog.pg_shseclabel WHERE provider = 'bdr';");
  
-       create_replication_identifier(local_conn, remote_ident,
-                                     cfg->init_replica ? remote_lsn : NULL);
+   if (PQresultStatus(res) != PGRES_COMMAND_OK)
+   {
+       PQclear(res);
+       die(_("Could not update security label: %s\n"), PQerrorMessage(conn));
+   }
+
+   /* Remove replication identifiers. */
+   res = PQexec(conn, "SELECT "RIINTERFACE_PREFIX"replication_identifier_drop(riname) FROM "RIINTERFACE_PREFIX"replication_identifier;");
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+   {
+       PQclear(res);
+       die(_("Could not remove existing replication identifiers: %s\n"), PQerrorMessage(conn));
     }
+   PQclear(res);
  }
  
  /*
- * Create local replication identifier
+ * Initialize new remote identifier to specific position.
   */
  static void
-create_replication_identifier(PGconn *conn, const char *remote_ident, char *remote_lsn)
+initialize_replication_identifier(PGconn *conn, NodeInfo *ni, Oid dboid, char *remote_lsn)
  {
-   PQExpBuffer query = createPQExpBuffer();
     PGresult   *res;
+   char        remote_ident[256];
+   PQExpBuffer query = createPQExpBuffer();
+
+   snprintf(remote_ident, sizeof(remote_ident), BDR_NODE_ID_FORMAT,
+               ni->remote_sysid, ni->remote_tlid, dboid, dboid, "");
  
     printfPQExpBuffer(query, "SELECT "RIINTERFACE_PREFIX"replication_identifier_create('%s')",
                      remote_ident);
@@ -937,76 +1051,70 @@ create_replication_identifier(PGconn *conn, const char *remote_ident, char *remo
   * state through physical replay.
   */
  static char *
-create_restore_point(char *remote_connstr)
+create_restore_point(PGconn *conn, char *restore_point_name)
  {
     PQExpBuffer  query = createPQExpBuffer();
     PGresult    *res;
     char        *remote_lsn = NULL;
  
-   remote_conn = PQconnectdb(remote_connstr);
-   if (PQstatus(remote_conn) != CONNECTION_OK)
-   {
-       die(_("Could not connect to the remote server: %s\n"),
-                   PQerrorMessage(remote_conn));
-   }
-
-   printfPQExpBuffer(query, "SELECT pg_create_restore_point('%s')", NameStr(restore_point_name));
-   res = PQexec(remote_conn, query->data);
+   printfPQExpBuffer(query, "SELECT pg_create_restore_point('%s')", restore_point_name);
+   res = PQexec(conn, query->data);
     if (PQresultStatus(res) != PGRES_TUPLES_OK)
     {
-       die(_("Could not create restore point \"%s\": status %s: %s\n"),
-            query->data,
+       die(_("Could not create restore point, status %s: %s\n"),
              PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
     }
     remote_lsn = pstrdup(PQgetvalue(res, 0, 0));
  
     PQclear(res);
-   PQfinish(remote_conn);
-   remote_conn = NULL;
     destroyPQExpBuffer(query);
  
     return remote_lsn;
  }
  
-static char *
-detect_local_conninfo(void)
-{
-   int i;
  
-   for (i = 0; i < bdr_connection_config_count; i++)
-   {
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
-
-       if (!cfg || !cfg->is_valid || !cfg->init_replica ||
-           !cfg->replica_local_dsn)
-           continue;
-
-       return pg_strdup(cfg->replica_local_dsn);
-   }
-
-   return NULL;
-}
-
-static char *
-detect_remote_conninfo(void)
+static void
+bdr_node_start(PGconn *conn, char *remote_connstr, char *local_connstr)
  {
-   int i;
+   PQExpBuffer  query = createPQExpBuffer();
+   PGresult    *res;
  
-   for (i = 0; i < bdr_connection_config_count; i++)
-   {
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
+   /* Install required extensions if needed. */
+   if (!extension_exists(conn, "btree_gist"))
+       install_extension(conn, "btree_gist");
+   if (!extension_exists(conn, "bdr"))
+       install_extension(conn, "bdr");
  
-       if (!cfg || !cfg->is_valid || !cfg->init_replica)
-           continue;
+   /* Add the node to the cluster. */
+#ifdef BUILDING_BDR
+   printfPQExpBuffer(query, "SELECT bdr.bdr_group_join(%s, %s);",
+                     PQescapeLiteral(conn, local_connstr, strlen(local_connstr)),
+                     PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)));
+#else
+   printfPQExpBuffer(query, "SELECT bdr.bdr_subscribe(%s, %s);",
+                     PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)),
+                     PQescapeLiteral(conn, local_connstr, strlen(local_connstr)));
+#endif
  
-       return pg_strdup(cfg->dsn);
+   res = PQexec(conn, query->data);
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+   {
+       die(_("Could not add local node to cluster, status %s: %s\n"),
+            PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
     }
  
-   return NULL;
+   PQclear(res);
+   destroyPQExpBuffer(query);
  }
  
+/*
+ * Build connection string from individual parameters.
+ *
+ * This function also handles case where full connection string was
+ * specified instead of dbname.
+ */
  char *
-get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser)
+get_connstr(char *dbname, char *dbhost, char *dbport, char *dbuser)
  {
     char        *ret;
     int         argcount = 4;   /* dbname, host, user, port */
@@ -1053,10 +1161,6 @@ get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser)
     {
         keywords = pg_malloc0((argcount + 1) * sizeof(*keywords));
         values = pg_malloc0((argcount + 1) * sizeof(*values));
-
-       keywords[i] = "dbname";
-       values[i] = dbname == NULL ? "postgres" : dbname;
-       i++;
     }
  
     if (dbhost)
@@ -1078,7 +1182,7 @@ get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser)
         i++;
     }
  
-   ret = PQconninfoParams_to_conninfo(keywords, values);
+   ret = PQconninfoParamsToConnstr(keywords, values);
  
     /* Connection ok! */
     pg_free(values);
@@ -1137,22 +1241,38 @@ WriteRecoveryConf(PQExpBuffer contents)
     fclose(cf);
  }
  
+/*
+ * Copy a configuration file into the data directory under the given name.
+ */
+static void
+CopyConfFile(char *fromfile, char *tofile)
+{
+   char        filename[MAXPGPATH];
+
+   sprintf(filename, "%s/%s", data_dir, tofile);
+
+   print_msg(VERBOSITY_DEBUG, _("Copying \"%s\" to \"%s\".\n"),
+             fromfile, filename);
+   copy_file(fromfile, filename);
+}
+
+
  /*
   * Convert PQconninfoOption array into conninfo string
   */
  static char *
-PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values)
+PQconninfoParamsToConnstr(const char *const * keywords, const char *const * values)
  {
     PQExpBuffer  retbuf = createPQExpBuffer();
     char        *ret;
     int          i = 0;
  
-   while (keywords[i])
+   for (i = 0; keywords[i] != NULL; i++)
     {
-       char *tmpval = escapeConninfoValue(values[i]);
-       appendPQExpBuffer(retbuf, "%s = '%s' ", keywords[i], tmpval);
-       pg_free(tmpval);
-       i++;
+       if (i > 0)
+           appendPQExpBufferChar(retbuf, ' ');
+       appendPQExpBuffer(retbuf, "%s=", keywords[i]);
+       appendPQExpBufferConnstrValue(retbuf, values[i]);
     }
  
     ret = pg_strdup(retbuf->data);
@@ -1164,371 +1284,130 @@ PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * v
  /*
   * Escape connection info value
   */
-static char *
-escapeConninfoValue(const char *val)
+static void
+appendPQExpBufferConnstrValue(PQExpBuffer buf, const char *str)
  {
-   int i, j;
-   char *ret = pg_malloc(strlen(val) * 2 + 1);
+   const char *s;
+   bool        needquotes;
  
-   j = 0;
-   for (i = 0; i < strlen(val); i++)
+   /*
+    * If the string consists entirely of plain ASCII characters, no need to
+    * quote it. This is quite conservative, but better safe than sorry.
+    */
+   needquotes = false;
+   for (s = str; *s; s++)
     {
-       switch (val[i])
+       if (!((*s >= 'a' && *s <= 'z') || (*s >= 'A' && *s <= 'Z') ||
+             (*s >= '0' && *s <= '9') || *s == '_' || *s == '.'))
         {
-           case '\\':
-           case '\'':
-               ret[j++] = '\\';
-           default:
-               break;
+           needquotes = true;
+           break;
         }
-
-       ret[j++] = val[i];
     }
  
-   ret[j] = '\0';
-
-   return ret;
-}
-
-
-/*
- * Taken from adt/bool.c
- *
- * Try to interpret value as boolean value.  Valid values are: true,
- * false, yes, no, on, off, 1, 0; as well as unique prefixes thereof.
- * If the string parses okay, return true, else false.
- * If okay and result is not NULL, return the value in *result.
- */
-static bool
-parse_bool(const char *value, bool *result)
-{
-   return parse_bool_with_len(value, strlen(value), result);
-}
-
-static bool
-parse_bool_with_len(const char *value, size_t len, bool *result)
-{
-   switch (*value)
+   if (needquotes)
     {
-       case 't':
-       case 'T':
-           if (pg_strncasecmp(value, "true", len) == 0)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           break;
-       case 'f':
-       case 'F':
-           if (pg_strncasecmp(value, "false", len) == 0)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       case 'y':
-       case 'Y':
-           if (pg_strncasecmp(value, "yes", len) == 0)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           break;
-       case 'n':
-       case 'N':
-           if (pg_strncasecmp(value, "no", len) == 0)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       case 'o':
-       case 'O':
-           /* 'o' is not unique enough */
-           if (pg_strncasecmp(value, "on", (len > 2 ? len : 2)) == 0)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           else if (pg_strncasecmp(value, "off", (len > 2 ? len : 2)) == 0)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       case '1':
-           if (len == 1)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           break;
-       case '0':
-           if (len == 1)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       default:
-           break;
-   }
-
-   if (result)
-       *result = false;        /* suppress compiler warning */
-   return false;
-}
-
-/*
- * Remove leading and trailing whitespace from the string,
- * does not change input
- */
-static char *
-trimwhitespace(const char *str)
-{
-   const char *end;
-   char *res;
-   size_t len;
-
-   while(isspace(*str))
-       str++;
-
-   if(*str == 0)
-       return NULL;
-
-   end = str + strlen(str) - 1;
-   while(end > str && isspace(*end))
-       end--;
-
-   len = end-str;
-   if (!len)
-       return NULL;
-
-   len++;
-   res = pg_malloc(len+1);
-   memcpy(res, str, len);
-   res[len] = '\0';
-
-   return res;
-}
-
-/*
- * Split guc list paramenter into array
- * Note that this is not 100% compatible with that is in core
- * but seems good enough for our purposes
- */
-static char    **
-split_list_guc(char *str, size_t *count)
-{
-   char    **ret = NULL;
-   char     *t = strtok (str, ",");
-   size_t    i = 0;
-
-   while (t) {
-       ret = realloc(ret, sizeof(char*)* ++i);
-
-       if (ret == NULL)
-           die(_("Out of memory\n"));
-
-       t = trimwhitespace(t);
-       if (!t)
-           die(_("Bad input for list: %s\n"), str);
-
-       ret[i-1] = t;
+       appendPQExpBufferChar(buf, '\'');
+       while (*str)
+       {
+           /* ' and \ must be escaped to \' and \\ */
+           if (*str == '\'' || *str == '\\')
+               appendPQExpBufferChar(buf, '\\');
  
-       t = strtok(NULL, ",");
+           appendPQExpBufferChar(buf, *str);
+           str++;
+       }
+       appendPQExpBufferChar(buf, '\'');
     }
-
-   *count = i;
-   return ret;
+   else
+       appendPQExpBufferStr(buf, str);
  }
  
  
  /*
   * Find the pgport and try a connection
- *
- * Based on pg_ctl.c:test_postmaster_connection
   */
-static bool
-wait_postmaster_connection(void)
+static void
+wait_postmaster_connection(const char *connstr)
  {
     PGPing      res;
-   long        pm_pid = 0;
-   char        connstr[MAXPGPATH * 2 + 256];
+   long        pmpid = 0;
  
-   connstr[0] = '\0';
+   print_msg(VERBOSITY_VERBOSE, "Waiting for PostgreSQL to accept connections ...");
  
+   /* First wait for Postmaster to come up. */
     for (;;)
     {
-       /* Do we need a connection string? */
-       if (connstr[0] == '\0')
-       {
-           /*----------
-            * The number of lines in postmaster.pid tells us several things:
-            *
-            * # of lines
-            *      0   lock file created but status not written
-            *      2   pre-9.1 server, shared memory not created
-            *      3   pre-9.1 server, shared memory created
-            *      5   9.1+ server, ports not opened
-            *      6   9.1+ server, shared memory not created
-            *      7   9.1+ server, shared memory created
-            *
-            * If we see less than 6 lines in postmaster.pid, just keep
-            * waiting.
-            *----------
-            */
-           char      **optlines;
-
-           /* Try to read the postmaster.pid file */
-           if ((optlines = readfile(pid_file)) != NULL &&
-               optlines[0] != NULL &&
-               optlines[1] != NULL &&
-               optlines[2] != NULL &&
-               optlines[3] != NULL &&
-               optlines[4] != NULL &&
-               optlines[5] != NULL)
-           {
-               /* File is complete enough for us, parse it */
-               long        pmpid;
-               time_t      pmstart;
-
-               /*
-                * Make sanity checks.  If it's for a standalone backend
-                * (negative PID), or the recorded start time is before
-                * pg_ctl started, then either we are looking at the wrong
-                * data directory, or this is a pre-existing pidfile that
-                * hasn't (yet?) been overwritten by our child postmaster.
-                * Allow 2 seconds slop for possible cross-process clock
-                * skew.
-                */
-               pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
-               pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
-               if (pmpid > 0 || pmstart > start_time - 3)
-               {
-                   /*
-                    * OK, seems to be a valid pidfile from our child.
-                    */
-                   int         portnum;
-                   char       *sockdir;
-                   char       *hostaddr;
-                   char        host_str[MAXPGPATH];
-
-                   pm_pid = pmpid;
-
-                   /*
-                    * Extract port number and host string to use. Prefer
-                    * using Unix socket if available.
-                    */
-                   portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]);
-                   sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1];
-                   hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1];
-
-                   /*
-                    * While unix_socket_directories can accept relative
-                    * directories, libpq's host parameter must have a
-                    * leading slash to indicate a socket directory.  So,
-                    * ignore sockdir if it's relative, and try to use TCP
-                    * instead.
-                    */
-                   if (sockdir[0] == '/')
-                       strlcpy(host_str, sockdir, sizeof(host_str));
-                   else
-                       strlcpy(host_str, hostaddr, sizeof(host_str));
-
-                   /* remove trailing newline */
-                   if (strchr(host_str, '\n') != NULL)
-                       *strchr(host_str, '\n') = '\0';
-
-                   /* Fail if couldn't get either sockdir or host addr */
-                   if (host_str[0] == '\0')
-                   {
-                       fprintf(stderr, _("Relative socket directory is not supported\n"));
-                       return false;
-                   }
-
-                   /* If postmaster is listening on "*", use localhost */
-                   if (strcmp(host_str, "*") == 0)
-                       strcpy(host_str, "localhost");
-
-                   /*
-                    * We need to set connect_timeout otherwise on Windows
-                    * the Service Control Manager (SCM) will probably
-                    * timeout first.
-                    */
-                   snprintf(connstr, sizeof(connstr),
-                   "dbname=postgres port=%d host='%s' connect_timeout=5",
-                            portnum, host_str);
-               }
-           }
+       if ((pmpid = get_pgpid()) != 0 &&
+           postmaster_is_alive((pid_t) pmpid))
+           break;
  
-           /*
-            * Free the results of readfile.
-            *
-            * This is safe to call even if optlines is NULL.
-            */
-           free_readfile(optlines);
-       }
+       pg_usleep(1000000);     /* 1 sec */
+       print_msg(VERBOSITY_VERBOSE, ".");
+   }
  
-       /* If we have a connection string, ping the server */
-       if (connstr[0] != '\0')
-       {
-           res = PQping(connstr);
-           if (res == PQPING_OK)
-           {
-               break;
-           }
-           else if (res == PQPING_NO_ATTEMPT)
-               return false;
-       }
+   /* Now wait for Postmaster to either accept connections or die. */
+   for (;;)
+   {
+       res = PQping(connstr);
+       if (res == PQPING_OK)
+           break;
+       else if (res == PQPING_NO_ATTEMPT)
+           break;
  
         /*
-        * If we've been able to identify the child postmaster's PID, check
-        * the process is still alive.  This covers cases where the postmaster
-        * successfully created the pidfile but then crashed without removing
-        * it.
+        * Check if the process is still alive. This covers cases where the
+        * postmaster successfully created the pidfile but then crashed without
+        * removing it.
          */
-       if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
-           return false;
+       if (!postmaster_is_alive((pid_t) pmpid))
+           break;
  
-       /* No response, or startup still in process; wait */
+       /* No response; wait */
         pg_usleep(1000000);     /* 1 sec */
-       print_msg(".");
+       print_msg(VERBOSITY_VERBOSE, ".");
     }
  
-   return true;
+   print_msg(VERBOSITY_VERBOSE, "\n");
  }
  
  /*
   * Wait for postmaster to die
   */
  static void
-wait_postgres_shutdown(void)
+wait_postmaster_shutdown(void)
  {
     long pid;
  
+   print_msg(VERBOSITY_VERBOSE, "Waiting for PostgreSQL to shutdown ...");
+
     for (;;)
     {
         if ((pid = get_pgpid()) != 0)
         {
             pg_usleep(1000000);     /* 1 sec */
-           print_msg(".");
+           print_msg(VERBOSITY_NORMAL, ".");
         }
         else
             break;
     }
+
+   print_msg(VERBOSITY_VERBOSE, "\n");
+}
+
+/*
+ * Return true when stat() succeeds on path, false on any stat() failure.
+ */
+static bool
+file_exists(const char *path)
+{
+   struct stat st;
+
+   return stat(path, &st) == 0;
  }
  
  static bool
-is_pg_dir(char *path)
+is_pg_dir(const char *path)
  {
     struct stat statbuf;
     char        version_file[MAXPGPATH];
@@ -1545,6 +1424,63 @@ is_pg_dir(char *path)
     return true;
  }
  
+/*
+ * Copy the contents of fromfile into tofile. tofile is created if it is
+ * missing and truncated if it already exists. Failures are passed to die(),
+ * except an error closing fromfile, which is ignored.
+ */
+static void
+copy_file(char *fromfile, char *tofile)
+{
+   char       *buffer;
+   int         srcfd;
+   int         dstfd;
+   int         nbytes;
+
+#define COPY_BUF_SIZE (8 * BLCKSZ)
+
+   buffer = malloc(COPY_BUF_SIZE);
+   if (buffer == NULL)
+       die(_("out of memory"));
+
+   /* Open the files */
+   srcfd = open(fromfile, O_RDONLY | PG_BINARY, 0);
+   if (srcfd < 0)
+       die(_("could not open file \"%s\""), fromfile);
+
+   /* Create tofile if needed; truncate so no stale tail survives */
+   dstfd = open(tofile, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+                             S_IRUSR | S_IWUSR);
+   if (dstfd < 0)
+       die(_("could not create file \"%s\""), tofile);
+
+   /* Copy the data one buffer at a time until read() reports EOF */
+   for (;;)
+   {
+       nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+       if (nbytes < 0)
+           die(_("could not read file \"%s\""), fromfile);
+       if (nbytes == 0)
+           break;
+       errno = 0;
+       if ((int) write(dstfd, buffer, nbytes) != nbytes)
+       {
+           /* if write didn't set errno, assume problem is no disk space */
+           if (errno == 0)
+               errno = ENOSPC;
+           die(_("could not write to file \"%s\""), tofile);
+       }
+   }
+
+   if (close(dstfd))
+       die(_("could not close file \"%s\""), tofile);
+
+   /* we don't care about errors here */
+   close(srcfd);
+
+   free(buffer);
+}
+
  /*
   * Utility functions taken from pg_ctl
   */
@@ -1622,112 +1558,3 @@ get_pgpid(void)
     fclose(pidf);
     return pid;
  }
-
-/*
- * get the lines from a text file - return NULL if file can't be opened
- */
-static char **
-readfile(const char *path)
-{
-   int         fd;
-   int         nlines;
-   char      **result;
-   char       *buffer;
-   char       *linebegin;
-   int         i;
-   int         n;
-   int         len;
-   struct stat statbuf;
-
-   /*
-    * Slurp the file into memory.
-    *
-    * The file can change concurrently, so we read the whole file into memory
-    * with a single read() call. That's not guaranteed to get an atomic
-    * snapshot, but in practice, for a small file, it's close enough for the
-    * current use.
-    */
-   fd = open(path, O_RDONLY | PG_BINARY, 0);
-   if (fd < 0)
-       return NULL;
-   if (fstat(fd, &statbuf) < 0)
-   {
-       close(fd);
-       return NULL;
-   }
-   if (statbuf.st_size == 0)
-   {
-       /* empty file */
-       close(fd);
-       result = (char **) pg_malloc(sizeof(char *));
-       *result = NULL;
-       return result;
-   }
-   buffer = pg_malloc(statbuf.st_size + 1);
-
-   len = read(fd, buffer, statbuf.st_size + 1);
-   close(fd);
-   if (len != statbuf.st_size)
-   {
-       /* oops, the file size changed between fstat and read */
-       free(buffer);
-       return NULL;
-   }
-
-   /*
-    * Count newlines. We expect there to be a newline after each full line,
-    * including one at the end of file. If there isn't a newline at the end,
-    * any characters after the last newline will be ignored.
-    */
-   nlines = 0;
-   for (i = 0; i < len; i++)
-   {
-       if (buffer[i] == '\n')
-           nlines++;
-   }
-
-   /* set up the result buffer */
-   result = (char **) pg_malloc((nlines + 1) * sizeof(char *));
-
-   /* now split the buffer into lines */
-   linebegin = buffer;
-   n = 0;
-   for (i = 0; i < len; i++)
-   {
-       if (buffer[i] == '\n')
-       {
-           int         slen = &buffer[i] - linebegin + 1;
-           char       *linebuf = pg_malloc(slen + 1);
-
-           memcpy(linebuf, linebegin, slen);
-           linebuf[slen] = '\0';
-           result[n++] = linebuf;
-           linebegin = &buffer[i + 1];
-       }
-   }
-   result[n] = NULL;
-
-   free(buffer);
-
-   return result;
-}
-
-/*
- * Free memory allocated for optlines through readfile()
- */
-void
-free_readfile(char **optlines)
-{
-   char       *curr_line = NULL;
-   int         i = 0;
-
-   if (!optlines)
-       return;
-
-   while ((curr_line = optlines[i++]))
-       free(curr_line);
-
-   free(optlines);
-
-   return;
-}
diff --git a/bdr_init_replica.c b/bdr_init_replica.c

index b5a5931bd612395040c7604e0f113f44677267eb..36e50c1cf15ff4e4a74f0c90deabd819b3b79ee7 100644 (file)
--- a/bdr_init_replica.c
+++ b/bdr_init_replica.c
@@ -51,266 +51,35 @@
  #include "storage/shmem.h"
  
  #include "utils/builtins.h"
+#include "utils/memutils.h"
  #include "utils/pg_lsn.h"
  #include "utils/syscache.h"
  
-char *bdr_temp_dump_directory = NULL;
-bool bdr_init_from_basedump = false;
-
-static void bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot);
-
-static void bdr_catchup_to_lsn(int cfg_index,
-                              XLogRecPtr target_lsn);
-
-/*
- * Search BdrWorkerCtl for a worker in dbname with init_replica set and
- * return it. The first worker found is returned (previous code should've
- * ensured there can only be one). If no match is found, return null.
- *
- * Must be called with at least a share lock on BdrWorkerCtl->lock
- *
- */
-static BdrWorker*
-find_init_replica_worker(Name dbname)
-{
-   int off;
-
-   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
-   /* Check whether one of our connections has init_replica set */
-   for (off = 0; off < bdr_max_workers; off++)
-   {
-       BdrApplyWorker         *aw;
-       BdrConnectionConfig    *cfg;
  
-       if (BdrWorkerCtl->slots[off].worker_type != BDR_WORKER_APPLY)
-           continue;
+char *bdr_temp_dump_directory = NULL;
  
-       aw = &BdrWorkerCtl->slots[off].data.apply;
-       cfg = bdr_connection_configs[aw->connection_config_idx];
+static void bdr_init_exec_dump_restore(BDRNodeInfo *node,
+                                      char *snapshot);
  
-       if ((strcmp(cfg->dbname, NameStr(*dbname)) == 0)
-           && cfg->init_replica)
-       {
-           return &BdrWorkerCtl->slots[off];
-       }
-   }
-   return NULL;
-}
+static void bdr_catchup_to_lsn(remote_node_info *ri, XLogRecPtr target_lsn);
  
  /*
- * Get this node's status value from the remote's bdr.bdr_nodes table
- * and return it.
+ * Make sure remote node has BDR activated (insert the security label).
   *
- * If no row is found, '\0' is returned.
+ * This is only needed for UDR.
   */
-static char
-bdr_get_remote_status(PGconn *pgconn)
+static void
+bdr_remote_activate(PGconn *pgconn)
  {
     PGresult           *res;
-   char                status;
-   Oid                 param_types[] = {TEXTOID, OIDOID, OIDOID};
-   const char         *param_values[3];
-   /* Needs to fit max length of UINT64_FORMAT */
-   char                sysid_str[33];
-   char                tlid_str[33];
-   char                mydatabaseid_str[33];
-
-   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
-            GetSystemIdentifier());
-   sysid_str[sizeof(sysid_str)-1] = '\0';
-
-   snprintf(tlid_str, sizeof(tlid_str), "%u",
-            ThisTimeLineID);
-   tlid_str[sizeof(tlid_str)-1] = '\0';
  
-   snprintf(mydatabaseid_str, sizeof(mydatabaseid_str), "%u",
-            MyDatabaseId);
-   mydatabaseid_str[sizeof(mydatabaseid_str)-1] = '\0';
-
-   param_values[0] = sysid_str;
-   param_values[1] = tlid_str;
-   param_values[2] = mydatabaseid_str;
-
-   res = PQexecParams(pgconn,
-                      "SELECT node_status FROM bdr.bdr_nodes "
-                      "WHERE node_sysid = $1 AND node_timeline = $2 "
-                      "AND node_dboid = $3 "
-                      "FOR UPDATE",
-                      3, param_types, param_values, NULL, NULL, 0);
+   res = PQexec(pgconn, "SELECT bdr.internal_update_seclabel()");
     if (PQresultStatus(res) != PGRES_TUPLES_OK)
     {
-       elog(FATAL, "bdr: Failed to get remote status during bdr init: state %s: %s\n",
+       elog(FATAL, "bdr: Failed to activate remote node during bdr init: state %s: %s\n",
              PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
     }
-   if (PQntuples(res) == 0)
-       /* No row found on remote, we're starting from scratch */
-       status = '\0';
-   else
-   {
-       char *status_str = PQgetvalue(res, 0, 0);
-       Assert(strlen(status_str) == 1);
-       status = status_str[0];
-   }
     PQclear(res);
-
-   return status;
-}
-
-/*
- * Update/delete/insert in bdr.bdr_nodes to ensure that the bdr.bdr_nodes row
- * for this worker's node ID matches the passed status before returning.
- *
- * The special case '\0' means "remove the row".
- *
- * No fancy upsert games are required here because we ensure that only one
- * worker can be initing any one database, and that node IDs are unique across
- * a group of BDR nodes.
- */
-static char
-bdr_set_remote_status(PGconn *pgconn, const char status,
-                     const char prev_status)
-{
-   PGresult           *res;
-   char               *status_str;
-   const uint64        sysid = GetSystemIdentifier();
-   /* Needs to fit max length of UINT64_FORMAT */
-   char                sysid_str[33];
-   char                tlid_str[33];
-   char                mydatabaseid_str[33];
-
-   if (status == prev_status)
-       /* No action required (we could check the remote, but meh) */
-       return status;
-
-   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
-            GetSystemIdentifier());
-   sysid_str[sizeof(sysid_str)-1] = '\0';
-
-   snprintf(tlid_str, sizeof(tlid_str), "%u",
-            ThisTimeLineID);
-   tlid_str[sizeof(tlid_str)-1] = '\0';
-
-   snprintf(mydatabaseid_str, sizeof(mydatabaseid_str), "%u",
-            MyDatabaseId);
-   mydatabaseid_str[sizeof(mydatabaseid_str)-1] = '\0';
-
-   if (status == '\0')
-   {
-       Oid         param_types[] = {TEXTOID, OIDOID, OIDOID};
-       const char *param_values[3];
-       char        new_status;
-
-       param_values[0] = sysid_str;
-       param_values[1] = tlid_str;
-       param_values[2] = mydatabaseid_str;
-
-       res = PQexecParams(pgconn,
-                          "DELETE FROM bdr.bdr_nodes WHERE node_sysid = $1"
-                          " AND node_timeline = $2 AND node_dboid = $3 "
-                          "RETURNING node_status",
-                          3, param_types, param_values, NULL, NULL, 0);
-
-       if (PQresultStatus(res) != PGRES_TUPLES_OK)
-       {
-           elog(FATAL, "bdr: Failed to delete row from bdr_nodes: status %s: %s\n",
-                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
-       }
-       if (PQntuples(res) == 0)
-       {
-           /*
-            * If prev_status was '\0' we wouldn't be here, so we should've
-            * got a returned value.
-            */
-           elog(FATAL, "bdr: bdr.bdr_nodes row for sysid=" UINT64_FORMAT
-                       ", tlid=%u, dboid=%u missing, expected row with status=%c",
-                sysid, ThisTimeLineID, MyDatabaseId, (int)prev_status);
-       }
-       status_str = PQgetvalue(res, 0, 0);
-       Assert(strlen(status_str) == 1);
-       new_status = status_str[0];
-
-       if (new_status != prev_status)
-       {
-           elog(FATAL, "bdr: bdr.bdr_nodes row for node_sysid=" UINT64_FORMAT
-                       ", timeline=%u, dboid=%u had status=%c, expected status=%c",
-                sysid, ThisTimeLineID, MyDatabaseId, (int) new_status,
-                (int) prev_status);
-       }
-
-       PQclear(res);
-   }
-   else
-   {
-       Oid         param_types[] = {CHAROID, TEXTOID, OIDOID, OIDOID};
-       const char *param_values[4];
-       char        new_status;
-       char        status_str[2];
-
-       snprintf(status_str, 2, "%c", (int)status);
-       param_values[0] = status_str;
-       param_values[1] = sysid_str;
-       param_values[2] = tlid_str;
-       param_values[3] = mydatabaseid_str;
-
-       res = PQexecParams(pgconn,
-                          "UPDATE bdr.bdr_nodes "
-                          "SET node_status = $1 "
-                          "WHERE node_sysid = $2 AND node_timeline = $3 "
-                          "AND node_dboid = $4 "
-                          "RETURNING ("
-                          "  SELECT node_status FROM bdr.bdr_nodes "
-                          "  WHERE node_sysid = $2 AND node_timeline = $3 "
-                          "  AND node_dboid = $4"
-                          ")",
-                          4, param_types, param_values, NULL, NULL, 0);
-
-       if (PQresultStatus(res) != PGRES_TUPLES_OK)
-       {
-           elog(FATAL,
-                "bdr: Failed to update bdr.nodes row: status %s: %s\n",
-                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
-       }
-       if (PQntuples(res) != 0)
-       {
-           char *new_status_str;
-           /* Updated a row */
-           new_status_str = PQgetvalue(res, 0, 0);
-           Assert(strlen(status_str) == 1);
-           new_status = new_status_str[0];
-           if (new_status != prev_status)
-           {
-               elog(FATAL,
-                    "bdr: bdr.bdr_nodes row for node_sysid=" UINT64_FORMAT
-                    ", timeline=%u, dboid=%u had status=%c, expected status=%c",
-                    sysid, ThisTimeLineID, MyDatabaseId, (int)new_status,
-                    (int)prev_status);
-           }
-
-           PQclear(res);
-       }
-       else
-       {
-           /* No rows affected, insert a new row instead. We re-use the previous
-            * query parameters. */
-           PQclear(res);
-           res = PQexecParams(pgconn,
-                              "INSERT INTO bdr.bdr_nodes"
-                              " (node_status, node_sysid, node_timeline, node_dboid)"
-                              " VALUES ($1, $2, $3, $4);",
-                              4, param_types, param_values, NULL, NULL, 0);
-
-           if (PQresultStatus(res) != PGRES_COMMAND_OK)
-           {
-               elog(FATAL,
-                    "bdr: Failed to insert row into bdr.bdr_nodes: status %s: %s\n",
-                    PQresStatus(PQresultStatus(res)),
-                    PQresultErrorMessage(res));
-           }
-           PQclear(res);
-       }
-   }
-
-   return status;
  }
  
  static XLogRecPtr
@@ -362,6 +131,8 @@ bdr_get_remote_ext_version(PGconn *pgconn, char **default_version,
     else if (PQntuples(res) == 0)
     {
         /* bdr ext is not known to Pg at all */
+       *default_version = NULL;
+       *installed_version = NULL;
     }
     else
     {
@@ -406,97 +177,6 @@ bdr_ensure_ext_installed(PGconn *pgconn)
     pfree(installed_version);
  }
  
-
-static void
-bdr_drop_slot_and_replication_identifier(BdrConnectionConfig *cfg)
-{
-
-   PGconn     *streamConn;
-   RepNodeId   replication_identifier;
-   NameData    slot_name;
-   TimeLineID  timeline;
-   Oid         dboid;
-   uint64      sysid;
-   PGresult   *res;
-   StringInfoData query;
-   char       *sqlstate;
-   NameData    appname;
-   char       *remote_ident;
-
-
-   elog(DEBUG1, "bdr %s: Dropping slot and local ident from connection %s",
-        cfg->dbname, cfg->name);
-
-   snprintf(NameStr(appname), NAMEDATALEN, "slot drop");
-   (NameStr(appname))[NAMEDATALEN-1] = '\0';
-
-   /* Establish BDR conn and IDENTIFY_SYSTEM */
-   streamConn = bdr_connect(
-       cfg->dsn, &appname,
-       &sysid, &timeline, &dboid
-       );
-
-   bdr_build_ident_and_slotname(sysid, timeline, dboid,
-           &remote_ident, &slot_name);
-
-
-   StartTransactionCommand();
-   replication_identifier = GetReplicationIdentifier(remote_ident, true);
-
-   pfree(remote_ident);
-
-   if (OidIsValid(replication_identifier))
-   {
-       /* Local replication identifier exists and must be dropped. */
-       elog(DEBUG2, "bdr %s: Deleting local replication identifier %hu",
-            cfg->dbname, replication_identifier);
-       DropReplicationIdentifier(replication_identifier);
-       /*
-        * We should CHECKPOINT after this to make sure replication
-        * identifier state gets flushed.
-        */
-       RequestCheckpoint(CHECKPOINT_IMMEDIATE|CHECKPOINT_FORCE);
-   }
-   else
-   {
-       elog(DEBUG2, "bdr %s: No local replication identifier to delete",
-            cfg->dbname);
-   }
-
-   /*
-    * Remove corresponding remote slot if it exists. We can't query
-    * whether it exists or not silently over the replication protocol,
-    * so we just try it and cope if it's missing.
-    */
-   initStringInfo(&query);
-   appendStringInfo(&query, "DROP_REPLICATION_SLOT %s", NameStr(slot_name));
-   res = PQexec(streamConn, query.data);
-   if (PQresultStatus(res) == PGRES_COMMAND_OK)
-   {
-       elog(DEBUG2, "bdr %s: remote replication slot %s deleted",
-            cfg->dbname, NameStr(slot_name));
-   }
-   else
-   {
-       /* SQLSTATE 42704 expected; others are error conditions */
-       sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
-       if (strcmp(sqlstate, "42704") != 0)
-       {
-           ereport(ERROR,
-                   (errmsg("'DROP_REPLICATION_SLOT %s' on bdr connection %s failed with sqlstate %s: %s",
-                           NameStr(slot_name), cfg->name,
-                           sqlstate,PQresultErrorMessage(res))));
-       }
-       else
-       {
-           elog(DEBUG2, "bdr %s: No slot to delete", cfg->dbname);
-       }
-   }
-   CommitTransactionCommand();
-   PQclear(res);
-   PQfinish(streamConn);
-}
-
  static void
  bdr_init_replica_cleanup_tmpdir(int errcode, Datum tmpdir)
  {
@@ -510,11 +190,14 @@ bdr_init_replica_cleanup_tmpdir(int errcode, Datum tmpdir)
  
  /*
   * Use a script to copy the contents of a remote node using pg_dump and apply
- * it to the local node. Runs during slot creation to bring up a new logical
- * replica from an existing node.
+ * it to the local node. Runs during node join to bring up a new logical
+ * replica from an existing node. The remote dump is taken from the start
+ * position of a slot on the remote end to ensure that we never replay
+ * changes included in the dump and never miss changes.
   */
  static void
-bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
+bdr_init_exec_dump_restore(BDRNodeInfo *node,
+                          char *snapshot)
  {
  #ifndef WIN32
     pid_t pid;
@@ -564,22 +247,21 @@ bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
  
  
     appendStringInfo(&origin_dsn,
-                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica dump'",
-                    cfg->dsn, BDR_LOCALID_FORMAT_ARGS, cfg->name);
+                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": init_replica dump'",
+                    node->init_from_dsn, BDR_LOCALID_FORMAT_ARGS);
  
-   if (cfg->replica_local_dsn == NULL)
-       elog(FATAL, "bdr init_replica: no replica_local_dsn specified");
     appendStringInfo(&local_dsn,
-                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica restore'",
-                    cfg->replica_local_dsn, BDR_LOCALID_FORMAT_ARGS, cfg->name);
+                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": init_replica restore'",
+                    node->local_dsn, BDR_LOCALID_FORMAT_ARGS);
  
     /*
      * Suppress replication of changes applied via pg_restore back to
      * the local node.
      *
-    * XXX DYNCONF: This should PQconninfoParse, modify the options keyword or
-    * add it, and reconstruct the string using the functions from pg_dumpall
-    * (also to be used for init_copy). This is a hack.
+    * TODO: This should PQconninfoParse, modify the options keyword or add
+    * it, and reconstruct the string using the functions from pg_dumpall
+    * (also to be used for init_copy). Simply appending the options
+    * instead is a bit dodgy.
      */
     appendStringInfoString(&local_dsn,
                            " options='-c bdr.do_not_replicate=on -c bdr.permit_unsafe_ddl_commands=on -c bdr.skip_ddl_replication=on -c bdr.skip_ddl_locking=on'");
@@ -628,8 +310,8 @@ bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
  
         ereport(LOG,
                 (errmsg("Creating replica with: %s --snapshot %s --source \"%s\" --target \"%s\" --tmp-directory \"%s\", --pg-dump-path \"%s\", --pg-restore-path \"%s\"",
-                       bdr_init_replica_script_path, snapshot, cfg->dsn,
-                       cfg->replica_local_dsn, tmpdir,
+                       bdr_init_replica_script_path, snapshot,
+                       node->init_from_dsn, node->local_dsn, tmpdir,
                         bdr_dump_path, bdr_restore_path)));
  
         n = execv(bdr_init_replica_script_path, argv);
@@ -699,376 +381,717 @@ bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
  #endif
  }
  
+/*
+ * BDR state synchronization.
+ *
+ * Opens a non-replication connection to the local node using
+ * local_node->local_dsn and then, inside one READ COMMITTED transaction on
+ * each side with bdr.bdr_nodes and bdr.bdr_connections locked in EXCLUSIVE
+ * mode:
+ *
+ *  - copies every remote bdr.bdr_nodes row to the local node,
+ *  - copies the local node's own bdr.bdr_nodes row to the remote node,
+ *  - copies every remote bdr.bdr_connections row to the local node.
+ *
+ * remote_conn is supplied by the caller and is left open. The local
+ * connection is closed before returning. Any failure is reported with
+ * elog(ERROR).
+ *
+ * Note that the two COMMITs are issued one after the other (remote first);
+ * nothing here makes them atomic with respect to each other.
+ */
  static void
-bdr_init_replica_conn_close(int code, Datum connptr)
+bdr_sync_nodes(PGconn *remote_conn, BDRNodeInfo *local_node)
  {
-   PGconn **conn_p;
-   PGconn *conn;
+   PGconn *local_conn;
  
-   conn_p = (PGconn**) DatumGetPointer(connptr);
-   Assert(conn_p != NULL);
-   conn = *conn_p;
+   local_conn = bdr_connect_nonrepl(local_node->local_dsn, "init");
  
-   if (conn == NULL)
-       return;
-   if (PQstatus(conn) != CONNECTION_OK)
-       return;
-   PQfinish(conn);
+   PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&local_conn));
+   {
+       StringInfoData query;
+       PGresult   *res;
+       char        sysid_str[33];
+       const char *const setup_query =
+           "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;\n"
+           "SET LOCAL search_path = bdr, pg_catalog;\n"
+           "SET LOCAL bdr.permit_unsafe_ddl_commands = on;\n"
+           "SET LOCAL bdr.skip_ddl_replication = on;\n"
+           "SET LOCAL bdr.skip_ddl_locking = on;\n"
+           "LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;\n"
+           "LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;\n";
+
+       /* Setup the environment. */
+       res = PQexec(remote_conn, setup_query);
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "BEGIN or table locking on remote failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+
+       res = PQexec(local_conn, setup_query);
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "BEGIN or table locking on local failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+
+       /* Copy remote bdr_nodes entries to the local node. */
+       bdr_copytable(remote_conn, local_conn,
+                     "COPY (SELECT * FROM bdr.bdr_nodes) TO stdout",
+                     "COPY bdr.bdr_nodes FROM stdin");
+
+       /* Copy the local entry to remote node. */
+       initStringInfo(&query);
+       /* No need to quote as everything is numbers. */
+       snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, local_node->sysid);
+       sysid_str[sizeof(sysid_str)-1] = '\0';
+       appendStringInfo(&query,
+                        "COPY (SELECT * FROM bdr.bdr_nodes WHERE "
+                           "node_sysid = '%s' AND node_timeline = '%u' "
+                           "AND node_dboid = '%u') TO stdout",
+                        sysid_str, local_node->timeline, local_node->dboid);
+
+       bdr_copytable(local_conn, remote_conn,
+                     query.data, "COPY bdr.bdr_nodes FROM stdin");
+
+       /*
+        * Copy remote connections to the local node.
+        *
+        * Adding local connection to remote node is handled separately
+        * because it triggers the connect-back process on the remote node(s).
+        */
+       bdr_copytable(remote_conn, local_conn,
+                     "COPY (SELECT * FROM bdr.bdr_connections) TO stdout",
+                     "COPY bdr.bdr_connections FROM stdin");
+
+       /* Save changes. */
+       res = PQexec(remote_conn, "COMMIT");
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "COMMIT on remote failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+
+       /* Report the local side as "local" so a failure can be attributed correctly. */
+       res = PQexec(local_conn, "COMMIT");
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "COMMIT on local failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+   }
+   PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                               PointerGetDatum(&local_conn));
+   PQfinish(local_conn);
+}
+
+/*
+ * Call bdr.internal_node_join() on the remote node reached through conn,
+ * passing this node's identity (sysid, timeline, database oid) together with
+ * the dsn, apply_delay and replication_sets taken from myconfig.
+ *
+ * Raises ERROR if the call does not return PGRES_TUPLES_OK. conn is left
+ * open for the caller.
+ */
+static void
+bdr_insert_remote_conninfo(PGconn *conn, BdrConnectionConfig *myconfig)
+{
+#define INTERNAL_NODE_JOIN_NPARAMS 6
+   PGresult   *res;
+   ExecStatusType      status;
+   Oid         types[INTERNAL_NODE_JOIN_NPARAMS] = { TEXTOID, OIDOID, OIDOID, TEXTOID, INT4OID, TEXTARRAYOID };
+   const char *values[INTERNAL_NODE_JOIN_NPARAMS];
+   StringInfoData      replicationsets;
+
+   /* Needs to fit max length of UINT64_FORMAT */
+   char                sysid_str[33];
+   char                tlid_str[33];
+   char                mydatabaseid_str[33];
+   char                apply_delay[33];
+
+   initStringInfo(&replicationsets);
+
+   /* Presumably fills the buffers with the local node's identity -- helper is defined elsewhere */
+   stringify_my_node_identity(sysid_str, sizeof(sysid_str),
+                              tlid_str, sizeof(tlid_str),
+                              mydatabaseid_str, sizeof(mydatabaseid_str));
+
+   values[0] = &sysid_str[0];
+   values[1] = &tlid_str[0];
+   values[2] = &mydatabaseid_str[0];
+   values[3] = myconfig->dsn;
+
+   /* Bound the write by the buffer itself rather than a repeated literal */
+   snprintf(&apply_delay[0], sizeof(apply_delay), "%d", myconfig->apply_delay);
+   values[4] = &apply_delay[0];
+   /*
+    * Replication sets are stored as a quoted identifier list. To turn
+    * it into an array literal we can just wrap some brackets around it.
+    */
+   appendStringInfo(&replicationsets, "{%s}", myconfig->replication_sets);
+   values[5] = replicationsets.data;
+
+   res = PQexecParams(conn,
+                      "SELECT bdr.internal_node_join($1,$2,$3,$4,$5,$6);",
+                      INTERNAL_NODE_JOIN_NPARAMS,
+                      types, &values[0], NULL, NULL, 0);
+
+   /*
+    * Grab the status and release the result straight away. The error text
+    * reported below comes from the connection, not from the result, so the
+    * PGresult is not leaked on either the success or the failure path.
+    */
+   status = PQresultStatus(res);
+   PQclear(res);
+   pfree(replicationsets.data);
+
+   /*
+    * bdr.internal_node_join() must correctly handle unique violations.
+    * Otherwise init that resumes after slot creation, when we're waiting
+    * for inbound slots, will fail.
+    */
+   if (status != PGRES_TUPLES_OK)
+       elog(ERROR, "unable to update remote bdr.bdr_connections: %s",
+                   PQerrorMessage(conn));
+
+#undef INTERNAL_NODE_JOIN_NPARAMS
  }
  
  /*
- * Determine whether we need to initialize the database from a remote
- * node and perform the required initialization if so.
+ * Find all connections other than our own using the copy of
+ * bdr.bdr_connections that we acquired from the remote server during
+ * apply. Apply workers won't be started yet, we're just making the
+ * slots.
+ *
+ * If the slot already exists from a prior attempt we'll leave it
+ * alone. It'll be advanced when we start replaying from it anyway,
+ * and it's guaranteed to retain more than the WAL we need.
   */
-void
-bdr_init_replica(Name dbname)
+static void
+bdr_init_make_other_slots()
  {
-   char status;
-   XLogRecPtr min_remote_lsn;
-   PGconn *nonrepl_init_conn;
-   StringInfoData dsn;
-   BdrWorker  *init_replica_worker;
-   BdrConnectionConfig *init_replica_config;
-   int spi_ret;
+   List       *configs;
+   ListCell   *lc;
+   MemoryContext old_context;
  
-   initStringInfo(&dsn);
+   Assert(!IsTransactionState());
+   StartTransactionCommand();
+   old_context = MemoryContextSwitchTo(TopMemoryContext);
+   configs = bdr_read_connection_configs();
+   MemoryContextSwitchTo(old_context);
+   CommitTransactionCommand();
+
+   foreach(lc, configs)
+   {
+       BdrConnectionConfig *cfg = lfirst(lc);
+       PGconn *conn;
+       NameData slot_name;
+       uint64 sysid;
+       TimeLineID timeline;
+       Oid dboid;
+       RepNodeId replication_identifier;
+       char *snapshot;
+
+       if (cfg->sysid == GetSystemIdentifier() &&
+           cfg->timeline == ThisTimeLineID &&
+           cfg->dboid == MyDatabaseId)
+       {
+           /* Don't make a slot pointing to ourselves */
+           continue;
+           bdr_free_connection_config(cfg);
+       }
+
+       conn = bdr_establish_connection_and_slot(cfg->dsn, "mkslot", &slot_name,
+               &sysid, &timeline, &dboid, &replication_identifier,
+               &snapshot);
+
+       /* Ensure the slot points to the node the conn info says it should */
+       if (cfg->sysid != sysid ||
+           cfg->timeline != timeline ||
+           cfg->dboid != dboid)
+       {
+           ereport(ERROR,
+                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                    errmsg("System identification mismatch between connection and slot"),
+                    errdetail("Connection for "BDR_LOCALID_FORMAT" resulted in slot on node "BDR_LOCALID_FORMAT" instead of expected node",
+                              cfg->sysid, cfg->timeline, cfg->dboid, EMPTY_REPLICATION_NAME,
+                              sysid, timeline, dboid, EMPTY_REPLICATION_NAME)));
+       }
+
+       /* We don't require the snapshot IDs here */
+       if (snapshot != NULL)
+           pfree(snapshot);
+
+       /* No replication for now, just close the connection */
+       PQfinish(conn);
  
-   elog(DEBUG2, "bdr %s: bdr_init_replica",
-        NameStr(*dbname));
+       elog(DEBUG2, "Ensured existence of slot %s on "BDR_LOCALID_FORMAT,
+                    NameStr(slot_name), cfg->sysid, cfg->timeline, cfg->dboid,
+                    EMPTY_REPLICATION_NAME);
+
+       bdr_free_connection_config(cfg);
+   }
+
+   list_free(configs);
+}
+
+/*
+ * For each outbound connection in bdr.bdr_connections we should have a local
+ * replication slot created by a remote node using our connection info.
+ *
+ * Wait until all such entries are created and active, then return.
+ *
+ * The wait polls pg_catalog.pg_replication_slots over SPI every 100ms inside
+ * a single transaction and has no timeout. SPI failures raise ERROR rather
+ * than being mistaken for "no slots found yet".
+ */
+static void
+bdr_init_wait_for_slot_creation()
+{
+   List       *configs;
+   ListCell   *lc;
+   Name*       slot_names;
+   Size        n_slots;
+   int         tup_idx, arr_idx;
+   int         spi_ret;
+
+   elog(INFO, "waiting for all inbound slots to be established");
  
     /*
-    * The local SPI transaction we're about to perform must do any writes as a
-    * local transaction, not as a changeset application from a remote node.
-    * That allows rows to be repliated to other nodes. So no replication_origin_id
-    * may be set.
+    * Determine the list of expected slot identifiers. These are
+    * inbound slots, so they're our db oid + the remote's bdr ident.
      */
-   Assert(replication_origin_id == InvalidRepNodeId);
+   StartTransactionCommand();
+   configs = bdr_read_connection_configs();
+
+   slot_names = (Name*)palloc0(sizeof(Name) * list_length(configs));
+
+   n_slots = 0;
+   foreach(lc, configs)
+   {
+       BdrConnectionConfig *cfg = lfirst(lc);
+       Name slot_name;
+
+       if (cfg->sysid == GetSystemIdentifier() &&
+           cfg->timeline == ThisTimeLineID &&
+           cfg->dboid == MyDatabaseId)
+       {
+           /* We won't see an inbound slot from our own node */
+           continue;
+       }
+
+       /* There's no corresponding incoming slot for a unidirectional slot */
+       if (cfg->is_unidirectional)
+           continue;
+
+       slot_name = (NameData*) palloc0(sizeof(NameData));
+       bdr_slot_name(slot_name, cfg->sysid, cfg->timeline, cfg->dboid,
+                     MyDatabaseId);
+
+       elog(DEBUG2, "expecting inbound slot named %s", NameStr(*slot_name));
+
+       slot_names[n_slots++] = slot_name;
+   }
  
     /*
-    * Check the local bdr.bdr_nodes over SPI or direct scan to see if
-    * there's an entry for ourselves in ready mode already.
+    * Wait for each to be created. There's no useful way to be notified when a
+    * slot gets created, so just scan all slots to see if all the ones we want
+    * are present and active. If not, sleep and retry soon.
      *
-    * Note that we don't have to explicitly SPI_finish(...) on error paths;
-    * that's taken care of for us.
+    * This is a very inefficient approach but for the number of slots we're
+    * interested in it doesn't matter.
      */
-   StartTransactionCommand();
-   spi_ret = SPI_connect();
-   if (spi_ret != SPI_OK_CONNECT)
-       elog(ERROR, "SPI already connected; this shouldn't be possible");
+   spi_ret = SPI_connect();
+   if (spi_ret != SPI_OK_CONNECT)
+       elog(ERROR, "SPI_connect failed with code %d", spi_ret);
  
-   status = bdr_nodes_get_local_status(GetSystemIdentifier(), ThisTimeLineID,
-                                       MyDatabaseId);
-   if (status == 'r')
+   while (true)
     {
-       /* Already in ready state, nothing more to do */
-       elog(DEBUG2, "init_replica: Already inited");
-       SPI_finish();
-       CommitTransactionCommand();
-       return;
+       Datum   values[1] = {MyDatabaseId};
+       Oid     types[1] = {OIDOID};
+       Size    n_slots_found = 0;
+
+       spi_ret = SPI_execute_with_args("select slot_name "
+                             "from pg_catalog.pg_replication_slots "
+                             "where plugin = '"BDR_LIBRARY_NAME"' "
+                             "and slot_type = 'logical' "
+                             "and datoid = $1 and active",
+                             1, types, values, NULL, false, 0);
+
+       /*
+        * A failed query would otherwise look like "zero slots found" and
+        * make us spin here forever, so treat it as a hard error.
+        */
+       if (spi_ret != SPI_OK_SELECT)
+           elog(ERROR, "querying pg_replication_slots failed with SPI code %d",
+                spi_ret);
+
+       for (tup_idx = 0; tup_idx < SPI_processed; tup_idx++)
+       {
+           char       *slot_name;
+
+           slot_name = SPI_getvalue(SPI_tuptable->vals[tup_idx],
+                                    SPI_tuptable->tupdesc,
+                                    1);
+
+           Assert(slot_name != NULL);
+
+           /*
+            * Does this slot appear in the array of expected slots and if so,
+            * have we seen it already?
+            *
+            * This is O(m*n) for m existing slots and n expected slots, but
+            * really, for this many slots, who cares.
+            */
+           for (arr_idx = 0; arr_idx < n_slots; arr_idx++)
+           {
+               if ( strcmp(NameStr(*slot_names[arr_idx]), slot_name) == 0 )
+               {
+                   n_slots_found++;
+                   break;
+               }
+           }
+
+           /* SPI_getvalue returns a palloc'd copy; don't let it pile up per poll */
+           pfree(slot_name);
+       }
+
+       /* Release this iteration's result set before polling again */
+       SPI_freetuptable(SPI_tuptable);
+
+       if (n_slots_found == n_slots)
+           break;
+
+       elog(DEBUG2, "found %u of %u expected slots, sleeping",
+            (uint32)n_slots_found, (uint32)n_slots);
+
+       pg_usleep(100000);
     }
  
+   SPI_finish();
+
+   CommitTransactionCommand();
+
+   elog(INFO, "all inbound slots established");
+
     /*
-    * Before starting workers we must determine if we need to copy
-    * initial state from a remote node. This is only necessary if
-    * there is a connection with init_replica set and we do not yet
-    * have an entry in the local "bdr.bdr_nodes" table for our node
-    * ID showing initialisation to be complete.
+    * Should this also check all outbound workers are connected? Doing so
+    * isn't simple - checking for replication identifiers doesn't confirm that
+    * the connection is active. We'd need to talk to the apply workers or try
+    * to convey information via pg_stat_activity.
      */
-   LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
-   init_replica_worker = find_init_replica_worker(dbname);
-   LWLockRelease(BdrWorkerCtl->lock);
-   if (!init_replica_worker)
+}
+
+/*
+ * TODO DYNCONF perform_pointless_transaction
+ *
+ * This is temporary code to be removed when the full part/join protocol is
+ * introduced, at which point WAL messages should handle this. See comments on
+ * call site.
+ *
+ * Runs a throwaway CREATE TEMP TABLE ... ON COMMIT DROP statement on conn
+ * and raises ERROR if it does not succeed. The node argument is currently
+ * unused.
+ */
+static void
+perform_pointless_transaction(PGconn *conn, BDRNodeInfo *node)
+{
+   PGresult   *res;
+   ExecStatusType  status;
+
+   res = PQexec(conn, "CREATE TEMP TABLE bdr_init(a int) ON COMMIT DROP");
+
+   /*
+    * Check at runtime rather than with Assert(): assertions are compiled
+    * out of production builds, where a failure here would go unnoticed.
+    * The error text is taken from the connection so the result can be
+    * released before reporting.
+    */
+   status = PQresultStatus(res);
+   PQclear(res);
+   if (status != PGRES_COMMAND_OK)
+       elog(ERROR, "could not perform dummy transaction on remote node: %s",
+            PQerrorMessage(conn));
+}
+
+/*
+ * Initialize the database, from a remote node if necessary.
+ */
+void
+bdr_init_replica(BDRNodeInfo *local_node)
+{
+   char                status;
+   PGconn             *nonrepl_init_conn;
+   StringInfoData      dsn;
+   BdrConnectionConfig *local_conn_config;
+
+   initStringInfo(&dsn);
+
+   status = local_node->status;
+
+   Assert(status != 'r');
+
+   elog(DEBUG2, "bdr_init_replica");
+
+   /*
+    * The local SPI transaction we're about to perform must do any writes as a
+    * local transaction, not as a changeset application from a remote node.
+    * That allows rows to be replicated to other nodes. So no replication_origin_id
+    * may be set.
+    */
+   Assert(replication_origin_id == InvalidRepNodeId);
+
+   /*
+    * Before starting workers we must determine if we need to copy initial
+    * state from a remote node. This is necessary unless we are the first node
+    * created or we've already completed init. Completed init (status 'r')
+    * is ruled out by the assertion above.
+    */
+   if (local_node->init_from_dsn == NULL)
     {
-       if (status != '\0')
+       if (status != 'b')
         {
             /*
              * Even though there's no init_replica worker, the local bdr.bdr_nodes table
              * has an entry for our (sysid,dbname) and it isn't status=r (checked above),
-            * we must've had an init_replica configured before, then removed.
+            * this should never happen
              */
-           ereport(ERROR, (errmsg("bdr.bdr_nodes row with (sysid="
-                   UINT64_FORMAT ", dbname=%s) exists and has status=%c, but "
-                   "no connection with init_replica=t is configured for this "
-                   "database. ",
-                   GetSystemIdentifier(), NameStr(*dbname), status),
-                   errdetail("You probably configured initial setup with "
-                   "init_replica on a connection, then removed or changed that "
-                   "connection before setup completed properly. "),
-                   errhint("DROP and re-create the database if it has no "
-                   "existing content of value, or add the init_replica setting "
-                   "to one of the connections.")));
+           ereport(ERROR, (errmsg("bdr.bdr_nodes row with "BDR_LOCALID_FORMAT" exists and has status=%c, "
+                                  "but has init_from_dsn set to NULL",
+                   GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId, EMPTY_REPLICATION_NAME, status)));
         }
         /*
          * No connections have init_replica=t, so there's no remote copy to do.
          * We still have to ensure that bdr.bdr_nodes.status is 'r' for this
          * node so that slot creation is permitted.
+        *
+        * XXX: is this actually a good idea?
          */
         elog(DEBUG2, "init_replica: Marking as root/standalone node");
         bdr_nodes_set_local_status('r');
-   }
-   /*
-    * We no longer require the transaction for SPI; further work gets done on
-    * the remote machine's bdr.bdr_nodes table and replicated back to us via
-    * pg_dump/pg_restore, or over the walsender protocol once we start
-    * replay. If we aren't just about to exit anyway.
-    */
-   SPI_finish();
-   CommitTransactionCommand();
  
-   if (!init_replica_worker)
-       /* Cleanup done and nothing more to do */
         return;
+   }
  
-   init_replica_config = bdr_connection_configs
-       [init_replica_worker->data.apply.connection_config_idx];
-   elog(LOG, "bdr %s: bdr_init_replica init from connection %s",
-        NameStr(*dbname), init_replica_config->name);
+   local_conn_config = bdr_get_connection_config(
+           local_node->sysid,
+           local_node->timeline,
+           local_node->dboid,
+           true);
  
-   resetStringInfo(&dsn);
-   appendStringInfo(&dsn,
-                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica setup'",
-                    init_replica_config->dsn, BDR_LOCALID_FORMAT_ARGS,
-                    init_replica_config->name);
+   elog(DEBUG1, "init_replica init from remote %s",
+        local_node->init_from_dsn);
  
-   /*
-    * Test to see if there's an entry in the remote's bdr.bdr_nodes for our
-    * system identifier. If there is, that'll tell us what stage of startup
-    * we are up to and let us resume an incomplete start.
-    */
-   nonrepl_init_conn = PQconnectdb(dsn.data);
-   if (PQstatus(nonrepl_init_conn) != CONNECTION_OK)
-   {
-       ereport(FATAL,
-               (errmsg("bdr %s: could not connect to the upstream server in non-replication mode: %s",
-                       NameStr(*dbname),
-                       PQerrorMessage(nonrepl_init_conn))));
-   }
+   nonrepl_init_conn =
+       bdr_connect_nonrepl(local_node->init_from_dsn, "init");
  
-   PG_ENSURE_ERROR_CLEANUP(bdr_init_replica_conn_close,
-           PointerGetDatum(&nonrepl_init_conn));
+   PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&nonrepl_init_conn));
     {
         bdr_ensure_ext_installed(nonrepl_init_conn);
  
-       /* Get the bdr.bdr_nodes status field for our node id from the remote */
-       status = bdr_get_remote_status(nonrepl_init_conn);
-
-       if (bdr_init_from_basedump)
-       {
-           status = bdr_set_remote_status(nonrepl_init_conn, 'c', status);
-       }
-       else
+       switch (status)
         {
-           switch (status)
-           {
-               case '\0':
-                   elog(DEBUG2, "bdr %s: initializing from clean state",
-                        NameStr(*dbname));
-                   break;
+           case 'b':
+               elog(DEBUG2, "initializing from clean state");
+               break;
  
-               case 'r':
-                   /*
-                    * Init has been completed, but we didn't check our local
-                    * bdr.bdr_nodes, or the final update hasn't propagated yet.
-                    *
-                    * All we need to do is catch up, we already replayed enough to be
-                    * consistent and start up in normal mode last time around
-                    */
-                   elog(DEBUG2, "bdr %s: init already completed, nothing to do",
-                        NameStr(*dbname));
-                   return;
-
-               case 'c':
-                   /*
-                    * We were in catchup mode when we died. We need to resume catchup
-                    * mode up to the expected LSN before switching over.
-                    *
-                    * To do that all we need to do is fall through without doing any
-                    * slot re-creation, dump/apply, etc, and pick up when we do
-                    * catchup.
-                    *
-                    * We won't know what the original catchup target point is, but we
-                    * can just catch up to whatever xlog position the server is
-                    * currently at.
-                    */
-                   elog(DEBUG2, "bdr %s: dump applied, need to continue catchup",
-                        NameStr(*dbname));
-                   break;
+           case 'r':
+               elog(ERROR, "unexpected state");
  
-               case 'i':
-                   /*
-                    * A previous init attempt seems to have failed. Clean up, then
-                    * fall through to start setup again.
-                    *
-                    * We can't just re-use the slot and replication identifier that
-                    * were created last time (if they were), because we have no way
-                    * of getting the slot's exported snapshot after
-                    * CREATE_REPLICATION_SLOT.
-                    */
-                   elog(DEBUG2, "bdr %s: previous failed initalization detected, cleaning up",
-                        NameStr(*dbname));
-                   bdr_drop_slot_and_replication_identifier(init_replica_config);
-                   status = bdr_set_remote_status(nonrepl_init_conn, '\0', status);
-                   break;
+           case 'c':
+               /*
+                * We were in catchup mode when we died. We need to resume catchup
+                * mode up to the expected LSN before switching over.
+                *
+                * To do that all we need to do is fall through without doing any
+                * slot re-creation, dump/apply, etc, and pick up where we do
+                * catchup.
+                *
+                * We won't know what the original catchup target point is, but we
+                * can just catch up to whatever xlog position the server is
+                * currently at, it's guaranteed to be later than the target
+                * position.
+                */
+               elog(DEBUG2, "dump applied, need to continue catchup");
+               break;
  
-               default:
-                   elog(ERROR, "unreachable"); /* Unhandled case */
-                   break;
-           }
+           case 'o':
+               elog(DEBUG2, "dump applied and catchup completed, need to continue slot creation");
+               break;
+
+           case 'i':
+               /*
+                * A previous init attempt seems to have failed.
+                * Clean up, then fall through to start setup
+                * again.
+                *
+                * We can't just re-use the slot and replication
+                * identifier that were created last time (if
+                * they were), because we have no way of getting
+                * the slot's exported snapshot after
+                * CREATE_REPLICATION_SLOT.
+                *
+                * We could drop and re-create the slot, but...
+                *
+                * We also have no way to undo a failed
+                * pg_restore, so if that phase fails it's
+                * necessary to do manual cleanup, dropping and
+                * re-creating the db.
+                *
+                * To avoid that we need to be able to run
+                * pg_restore --clean, and that needs a way to
+                * exclude the bdr schema, the bdr extension,
+                * and their dependencies like plpgsql and
+                * btree_gist. (TODO patch pg_restore for that)
+                */
+               ereport(ERROR,
+                       (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                        errmsg("previous init failed, manual cleanup is required"),
+                        errdetail("Found bdr.bdr_nodes entry for "BDR_LOCALID_FORMAT" with state=i in remote bdr.bdr_nodes", BDR_LOCALID_FORMAT_ARGS),
+                        errhint("Remove all replication identifiers and slots corresponding to this node from the init target node then drop and recreate this database and try again")));
+               break;
+
+           default:
+               elog(ERROR, "unreachable %c", status); /* Unhandled case */
+               break;
         }
  
-       if (status == '\0')
+       if (status == 'b')
         {
-           int         off;
-           int        *my_conn_idxs;
-           int         n_conns = 0;
             char       *init_snapshot = NULL;
             PGconn     *init_repl_conn = NULL;
+           NameData    slot_name;
+           uint64      remote_sysid;
+           TimeLineID  remote_timeline;
+           Oid         remote_dboid;
+           RepNodeId   repnodeid;
  
-           elog(LOG, "bdr %s: initializing from remote db", NameStr(*dbname));
+           elog(INFO, "initializing node");
  
             /*
              * We're starting from scratch or have cleaned up a previous failed
              * attempt.
              */
-           status = bdr_set_remote_status(nonrepl_init_conn, 'i', status);
+           status = 'i';
+           bdr_nodes_set_local_status(status);
  
             /*
-            * A list of all connections to make slots for, as indexes into
-            * BdrWorkerCtl.
+            * This is unidirectional subscribe, let the other node know that
+            * it should behave as BDR node (as it might be UDR node which does
+            * not require init).
              */
-           my_conn_idxs = (int*)palloc(sizeof(Size) * bdr_max_workers);
+           if (local_conn_config == NULL)
+               bdr_remote_activate(nonrepl_init_conn);
  
-           /* Collect a list of connections to make slots for. */
-           LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
-           for (off = 0; off < bdr_max_workers; off++)
-           {
-               BdrWorker              *worker = &BdrWorkerCtl->slots[off];
-
-               if (worker->worker_type == BDR_WORKER_APPLY)
-               {
-                   BdrConnectionConfig * const cfg = bdr_connection_configs
-                       [worker->data.apply.connection_config_idx];
+           /*
+            * Now establish our slot on the target node, so we can replay
+            * changes from that node. It'll be used in catchup mode.
+            */
+           init_repl_conn = bdr_establish_connection_and_slot(
+                               local_node->init_from_dsn,
+                               "init", &slot_name,
+                               &remote_sysid, &remote_timeline, &remote_dboid,
+                               &repnodeid, &init_snapshot);
  
-                   if (strcmp(cfg->dbname, NameStr(*dbname)) == 0)
-                       my_conn_idxs[n_conns++] = off;
-               }
-           }
-           LWLockRelease(BdrWorkerCtl->lock);
+           elog(INFO, "connected to target node "BDR_LOCALID_FORMAT
+                " with snapshot %s",
+                remote_sysid, remote_timeline, remote_dboid,
+                EMPTY_REPLICATION_NAME, init_snapshot);
  
-           elog(DEBUG2, "bdr %s: creating slots for %d nodes",
-                NameStr(*dbname), n_conns);
+           /*
+            * Take the remote dump and apply it. This will give us a local
+            * copy of bdr_connections to work from. It's guaranteed that
+            * everything after this dump will be accessible via the catchup
+            * mode slot created earlier.
+            */
+           bdr_init_exec_dump_restore(local_node, init_snapshot);
  
             /*
-            * For each connection, ensure its slot exists.
+            * TODO DYNCONF copy replication identifier state
+            *
+            * Should copy the target node's pg_catalog.pg_replication_identifier
+            * state for each node to the local node, using the same snapshot
+            * we used to take the dump from the remote. Doing this ensures
+            * that when we create slots to the target nodes they'll begin
+            * replay from a position that's exactly consistent with what's
+            * in the dump.
              *
-            * Do it one by one rather than fiddling with async libpq queries. If
-            * this needs to be parallelized later, it should probably be done by
-            * launching each apply worker and letting them create their own
-            * slots, then having them wait until signalled/unlatched before
-            * proceeding with actual replication. That'll save us another round
-            * of connections too.
+            * We'll still need catchup mode because there's no guarantee our
+            * newly created slots will force all WAL we'd need to be retained
+            * on each node. The target might be behind. So we should catchup
+            * replay until the replication identifier positions received from
+            * catchup are >= the creation positions of the slots we made.
              *
-            * We don't attempt any cleanup if slot creation fails, we just bail out
-            * and leave any already-created slots in place.
+            * (We don't need to do this if we instead send a replay confirmation
+            * request and wait for a reply from each node.)
              */
-           for (off = 0; off < n_conns; off++)
-           {
-               BdrWorker *w = &BdrWorkerCtl->slots[my_conn_idxs[off]];
-               BdrConnectionConfig *cfg;
-               char *snapshot = NULL;
-               PGconn *conn = NULL;
-               RepNodeId replication_identifier;
-               NameData slot_name;
-               uint64 sysid;
-               Oid dboid;
-               TimeLineID timeline;
-
-               cfg = bdr_connection_configs
-                   [w->data.apply.connection_config_idx];
-
-               ereport(LOG,
-                       (errmsg("bdr %s: checking/creating slot for %s at %s",
-                               NameStr(*dbname), cfg->name, cfg->dsn)));
-               /*
-                * Create the slot on the remote. The returned remote sysid and
-                * timeline, the slot name, and the local replication identifier
-                * are all discarded; they're not needed here, and will be obtained
-                * again by the apply workers when they're launched after init.
-                */
-               conn = bdr_establish_connection_and_slot(cfg->dsn, "slot",
-                   &slot_name, &sysid, &timeline, &dboid, &replication_identifier,
-                   &snapshot);
  
-               /* Always throws rather than returning failure */
-               Assert(conn);
+           PQfinish(init_repl_conn);
+           pfree(init_snapshot);
  
-               if (w == init_replica_worker)
-               {
-                   /*
-                    * We need to keep the snapshot ID returned by CREATE SLOT so
-                    * we can pass it to pg_dump to get a consistent dump from the
-                    * remote slot's start point.
-                    *
-                    * The snapshot is only valid for the lifetime of the
-                    * replication connection we created it with, so we must keep
-                    * that connection around until the dump finishes.
-                    */
-                   if (!snapshot)
-                       elog(ERROR, "bdr %s: init_replica failed to create snapshot!",
-                            NameStr(*dbname));
-                   init_snapshot = snapshot;
-                   init_repl_conn = conn;
-               }
-               else
-               {
-                   /*
-                    * Just throw the returned info away; we only needed to create
-                    * the slot so its replication identifier can be advanced
-                    * during catchup.
-                    */
-                   if (snapshot)
-                       pfree(snapshot);
-                   PQfinish(conn);
-               }
+           /*
+            * This is group join, copy the state (bdr_nodes and
+            * bdr_connections) over from the init node to our node.
+            */
+           if (local_conn_config != NULL)
+           {
+               elog(DEBUG1, "syncing bdr_nodes and bdr_connections");
+               bdr_sync_nodes(nonrepl_init_conn, local_node);
             }
  
-           pfree(my_conn_idxs);
+           status = 'c';
+           bdr_nodes_set_local_status(status);
+           elog(DEBUG1, "dump and apply finished, preparing for catchup replay");
+       }
+
+       Assert(status != 'b');
+
+       if (status == 'c')
+       {
+           XLogRecPtr min_remote_lsn;
+           remote_node_info ri;
+
+           /*
+            * Launch outbound connections to all other nodes. It doesn't
+            * matter that their slot horizons are after the dump was taken on
+            * the origin node, so we could never replay all the data we need
+            * if we switched to replaying from these slots now.  We'll be
+            * advancing them in catchup mode until they overtake their current
+            * position before switching to replaying from them directly.
+            */
+           bdr_init_make_other_slots();
  
-           /* If we get here, we should have a valid snapshot to dump */
-           Assert(init_snapshot != NULL);
-           Assert(init_repl_conn != NULL);
+           /*
+            * Enter catchup mode and wait until we've replayed up to the LSN
+            * the remote was at when we started catchup.
+            *
+            * TODO: It's possible that this step can lose transactions that
+            * were committed on a 3rd party node before we made our slot on it
+            * but not replicated to the init target node until after we exit
+            * catchup mode. If we acquire the DDL lock during join we can know
+            * that can't happen, so we should do that.
+            */
+           elog(DEBUG3, "getting LSN to replay to in catchup mode");
+           min_remote_lsn = bdr_get_remote_lsn(nonrepl_init_conn);
  
             /*
-            * Execute the dump and apply its self.
+            * Catchup cannot complete if there isn't at least one remote transaction
+            * to replay. So we perform a dummy transaction on the target node.
              *
-            * Note that the bdr extension tables override pg_dump's default and
-            * ask to be included in dumps. In particular, bdr.bdr_nodes will get
-            * copied over.
+            * XXX This is a hack. What we really *should* be doing is asking
+            * the target node to send a catchup confirmation wal message, then
+            * wait until all its current peers (we aren't one yet) reply with
+            * confirmation. Then we should be replaying until we get
+            * confirmation of this from the init target node, rather than
+            * replaying to some specific LSN. The full part/join
+            * protocol should take care of this.
              */
-           elog(DEBUG1, "bdr %s: creating and restoring dump for %s",
-                NameStr(*dbname), init_replica_config->name);
-           bdr_exec_init_replica(init_replica_config, init_snapshot);
-           PQfinish(init_repl_conn);
+           elog(DEBUG3, "forcing a new transaction on the target node");
+           perform_pointless_transaction(nonrepl_init_conn, local_node);
  
-           pfree(init_snapshot);
-           status = bdr_set_remote_status(nonrepl_init_conn, 'c', status);
+           bdr_get_remote_nodeinfo_internal(nonrepl_init_conn, &ri);
+
+           /* Launch the catchup worker and wait for it to finish */
+           elog(DEBUG1, "launching catchup mode apply worker");
+           bdr_catchup_to_lsn(&ri, min_remote_lsn);
+
+           free_remote_node_info(&ri);
+
+           /*
+            * We're done with catchup. The next phase is inserting our
+            * conninfo, so set status=o
+            */
+           status = 'o';
+           bdr_nodes_set_local_status(status);
+           elog(DEBUG1, "catchup worker finished, requesting slot creation");
         }
  
-       Assert(status == 'c');
+       /* To reach here we must be waiting for slot creation */
+       Assert(status == 'o');
+
+       /*
+        * It is now safe to start apply workers, as we've finished catchup.
+        * Doing so ensures that we will replay our own bdr.bdr_nodes changes
+        * from the target node and also makes sure we stay more up-to-date,
+        * reducing slot lag on other nodes.
+        */
+       bdr_launch_apply_workers(MyDatabaseId);
  
-       /* Launch the catchup worker and wait for it to finish */
-       elog(DEBUG1, "bdr %s: launching catchup mode apply worker", NameStr(*dbname));
-       min_remote_lsn = bdr_get_remote_lsn(nonrepl_init_conn);
-       bdr_catchup_to_lsn(
-           init_replica_worker->data.apply.connection_config_idx,
-           min_remote_lsn);
-       status = bdr_set_remote_status(nonrepl_init_conn, 'r', status);
+       /*
+        * Insert our connection info on the remote end. This will prompt
+        * the other end to connect back to us and make a slot, and will
+        * cause the other nodes to do the same when they receive the new
+        * row.
+        *
+        * It makes no sense to do this with UDR, where the peer doesn't
+        * connect back to us.
+        */
+       if (local_conn_config != NULL)
+       {
+           elog(DEBUG1, "inserting our connection into remote end");
+           bdr_insert_remote_conninfo(nonrepl_init_conn, local_conn_config);
+       }
+
+       /*
+        * Wait for all outbound and inbound slot creation to be complete.
+        *
+        * The inbound slots aren't yet required to relay local writes to
+        * remote nodes, but they'll be used to write our catchup
+        * confirmation request WAL message, so we need them to exist.
+        *
+        * This makes no sense on UDR, where the init target doesn't
+        * connect back to us and no other inbound or outbound connections
+        * exist. It still gets run, but we won't find any inbound
+        * slots to look for.
+        */
+       elog(DEBUG1, "waiting for all inbound slots to be created");
+       bdr_init_wait_for_slot_creation();
  
-       elog(INFO, "bdr %s: catchup worker finished, ready for normal replication",
-            NameStr(*dbname));
+       /*
+        * We now have inbound and outbound slots for all nodes, and
+        * we're caught up to a reasonably recent state from the target
+        * node thanks to the dump and catchup mode operation.
+        *
+        * Set the node state to 'r'eady and allow writes.
+        *
+        * TODO: Before we can really be sure we're ready we should be
+        * sending a replay confirmation request and waiting for all
+        * nodes to reply, so we know we have full communication.
+        */
+       status = 'r';
+       bdr_nodes_set_local_status(status);
+       elog(INFO, "finished init_replica, ready to enter normal replication");
     }
-   PG_END_ENSURE_ERROR_CLEANUP(bdr_init_replica_conn_close,
-           PointerGetDatum(&nonrepl_init_conn));
+   PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&nonrepl_init_conn));
+
+   Assert(status == 'r');
  
     PQfinish(nonrepl_init_conn);
  }
@@ -1103,32 +1126,27 @@ bdr_catchup_to_lsn_cleanup(int code, Datum offset)
   * When we finish applying and the worker exits, we'll be caught up with the
   * remote and in a consistent state where all our local replication identifiers
   * are consistent with the actual state of the local DB.
- *
- * Arguments:
- *
- * cfg_index: Index of the bdr connection for this dbname with init_worker=t
- * set within bdr_connection_configs. Used to start the worker.
- *
- * target_lsn: LSN of immediate origin node at which catchup should stop.
   */
  static void
-bdr_catchup_to_lsn(int cfg_index,
-                  XLogRecPtr target_lsn)
+bdr_catchup_to_lsn(remote_node_info *ri, XLogRecPtr target_lsn)
  {
     uint32 worker_shmem_idx;
     BdrWorker *worker;
-   BdrConnectionConfig *cfg;
-
-   cfg = bdr_connection_configs[cfg_index];
-   Assert(cfg != NULL);
-   Assert(cfg->init_replica);
+   BdrApplyWorker *catchup_worker;
  
-   elog(DEBUG1, "Registering bdr apply catchup worker %s for db %s to lsn %X/%X",
-        cfg->name, cfg->dbname,
+   elog(DEBUG1, "Registering bdr apply catchup worker for "BDR_LOCALID_FORMAT" to lsn %X/%X",
+        ri->sysid, ri->timeline, ri->dboid, EMPTY_REPLICATION_NAME,
          (uint32)(target_lsn>>32), (uint32)target_lsn);
  
     /* Create the shmem entry for the catchup worker */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
     worker = bdr_worker_shmem_alloc(BDR_WORKER_APPLY, &worker_shmem_idx);
+   catchup_worker = &worker->data.apply;
+   catchup_worker->dboid = MyDatabaseId;
+   catchup_worker->remote_sysid = ri->sysid;
+   catchup_worker->remote_timeline = ri->timeline;
+   catchup_worker->remote_dboid = ri->dboid;
+   LWLockRelease(BdrWorkerCtl->lock);
  
     /*
      * Launch the catchup worker, ensuring that we free the shmem slot for the
@@ -1146,10 +1164,7 @@ bdr_catchup_to_lsn(int cfg_index,
         BackgroundWorkerHandle *bgw_handle;
         pid_t bgw_pid;
         pid_t prev_bgw_pid = 0;
-       BdrApplyWorker *catchup_worker = &worker->data.apply;
-
-       /* Make sure the catchup worker can find its bdr.xxx_ GUCs */
-       catchup_worker->connection_config_idx = cfg_index;
+       uint32 worker_arg;
  
         /* Special parameters for a catchup worker only */
         catchup_worker->replay_stop_lsn = target_lsn;
@@ -1164,14 +1179,16 @@ bdr_catchup_to_lsn(int cfg_index,
         strncpy(bgw.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
  
         bgw.bgw_restart_time = BGW_NEVER_RESTART;
+       Assert(MyProc->pid != 0);
         bgw.bgw_notify_pid = MyProc->pid;
-       bgw.bgw_main_arg = Int32GetDatum(worker_shmem_idx);
+
+       Assert(worker_shmem_idx <= UINT16_MAX);
+       worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)worker_shmem_idx;
+       bgw.bgw_main_arg = Int32GetDatum(worker_arg);
  
         snprintf(bgw.bgw_name, BGW_MAXLEN,
-                "bdr %s: catchup apply to %X/%X on %s",
-                cfg->dbname,
-                (uint32)(target_lsn >> 32), (uint32)target_lsn,
-                cfg->name);
+                "bdr: catchup apply to %X/%X",
+                (uint32)(target_lsn >> 32), (uint32)target_lsn);
         bgw.bgw_name[BGW_MAXLEN-1] = '\0';
  
         /* Launch the catchup worker and wait for it to start */
@@ -1228,14 +1245,12 @@ bdr_catchup_to_lsn(int cfg_index,
         {
             /* Worker must've died before it finished */
             elog(ERROR,
-                "bdr %s: catchup worker exited before catching up to target LSN %X/%X",
-                cfg->dbname,
+                "catchup worker exited before catching up to target LSN %X/%X",
                  (uint32)(target_lsn>>32), (uint32)target_lsn);
         }
         else
         {
-           elog(DEBUG1, "bdr %s: catchup worker caught up to target LSN",
-                cfg->dbname);
+           elog(DEBUG1, "catchup worker caught up to target LSN");
         }
     }
     PG_END_ENSURE_ERROR_CLEANUP(bdr_catchup_to_lsn_cleanup,
diff --git a/bdr_internal.h b/bdr_internal.h

index 9798731cd9f796da265dcf254478b7868a1c8069..8c529511990f290cfb07c7456e60f976d0537b8d 100644 (file)
--- a/bdr_internal.h
+++ b/bdr_internal.h
@@ -14,27 +14,37 @@
  
  #include "lib/ilist.h"
  
+#define EMPTY_REPLICATION_NAME ""
  #define BDR_SLOT_NAME_FORMAT "bdr_%u_%s_%u_%u__%s"
  #define BDR_NODE_ID_FORMAT "bdr_"UINT64_FORMAT"_%u_%u_%u_%s"
  
-/* GUC storage for a configured BDR connection. */
+/* A configured BDR connection from bdr_connections */
  typedef struct BdrConnectionConfig
  {
-   char *dsn;
-   int   apply_delay;
-   bool  init_replica;
-   char *replica_local_dsn;
-   char *replication_sets;
+   uint64      sysid;
+   TimeLineID  timeline;
+   Oid         dboid;
  
     /*
-    * These aren't technically GUCs, but are per-connection config
-    * information obtained from the GUCs.
+    * If the origin_ id fields are set then they must refer to our node,
+    * otherwise we wouldn't load the configuration entry. So if origin_is_my_id
+    * is false the origin was zero, and if true the origin is the local node
+    * id.
      */
-   char *name;
-   char *dbname;
+   bool origin_is_my_id;
  
-   /* Connection config might be broken (blank dsn, etc) */
-   bool is_valid;
+   /*
+    * Is this connection unidirectional, or should we expect a reciprocal
+    * inbound connection and slot?
+    */
+   bool is_unidirectional;
+
+   char *dsn;
+
+   int   apply_delay;
+
+   /* Quoted identifier-list of replication sets */
+   char *replication_sets;
  } BdrConnectionConfig;
  
  typedef struct BdrFlushPosition
@@ -49,5 +59,15 @@ extern volatile sig_atomic_t got_SIGHUP;
  
  extern void bdr_error_nodeids_must_differ(uint64 sysid, TimeLineID timeline,
                                           Oid dboid);
+extern List* bdr_read_connection_configs(void);
+extern BdrConnectionConfig* bdr_get_connection_config(uint64 sysid,
+                                                     TimeLineID timeline,
+                                                     Oid dboid,
+                                                     bool missing_ok);
+
+extern void bdr_free_connection_config(BdrConnectionConfig *cfg);
+
+extern void bdr_slot_name(Name slot_name, uint64 sysid, TimeLineID tlid,
+                         Oid dboid, Oid local_dboid);
  
  #endif   /* BDR_INTERNAL_H */
diff --git a/bdr_isolationregress.conf b/bdr_isolationregress.conf

index bd2b7cb0781597fbe6a11cc8212c834c1c437e34..f29108ffe5454b34e798a44b15666388f532b739 100644 (file)
--- a/bdr_isolationregress.conf
+++ b/bdr_isolationregress.conf
@@ -2,32 +2,11 @@ include = 'bdr_regress_common.conf'
  
  track_commit_timestamp = on
  
-bdr.connections = 'node1to2,node1to3,node2to3,node2to1,node3to1,node3to2'
-
-bdr.node1to2_dsn = 'dbname=node2'
-bdr.node1to2_local_dbname = 'node1'
-bdr.node1to3_dsn = 'dbname=node3'
-bdr.node1to3_local_dbname = 'node1'
-
-bdr.node2to1_dsn = 'dbname=node1'
-bdr.node2to1_local_dbname = 'node2'
-#bdr.node2to1_init_replica=on
-#bdr.node2to1_replica_local_dsn='dbname=node2'
-bdr.node2to3_dsn = 'dbname=node3'
-bdr.node2to3_local_dbname = 'node2'
-
-bdr.node3to1_dsn = 'dbname=node1'
-bdr.node3to1_local_dbname = 'node3'
-#bdr.node3to1_init_replica=on
-#bdr.node3to1_replica_local_dsn='dbname=node3'
-bdr.node3to2_dsn = 'dbname=node2'
-bdr.node3to2_local_dbname = 'node3'
-
  bdr.log_conflicts_to_table = True
  bdr.default_apply_delay = 100
  
  #log_min_messages = 'debug4'
  #log_line_prefix = 'd=%d p=%p a=%a%q '
-log_statement = 'all'
+#log_statement = 'all'
  
  max_worker_processes = 18
diff --git a/bdr_label.c b/bdr_label.c

index ed20add9d402ee9edc68abfc2e304b5b1f216d4a..c698238a179089e001ce614ab56e1ad9f8e8da0d 100644 (file)
--- a/bdr_label.c
+++ b/bdr_label.c
@@ -36,7 +36,7 @@ void
  bdr_label_init(void)
  {
     /* Security label provider hook */
-   register_label_provider("bdr", bdr_object_relabel);
+   register_label_provider(BDR_SECLABEL_PROVIDER, bdr_object_relabel);
  }
  
  static void
diff --git a/bdr_label.h b/bdr_label.h

index 0103a2a98e8de8e0168e25f13eba141cd938aac0..8cba0736b4db0df675bf1dc76245e6ccc51f444f 100644 (file)
--- a/bdr_label.h
+++ b/bdr_label.h
@@ -8,4 +8,6 @@
   * bdr_label.h
   */
  
+#define BDR_SECLABEL_PROVIDER "bdr"
+
  extern void bdr_label_init(void);
diff --git a/bdr_locks.c b/bdr_locks.c

index 933afb08f46af1b831bdbe0064df70ac3f495105..a6fa042d47401c6e3d3887025e3ee62e6c2cb866 100644 (file)
--- a/bdr_locks.c
+++ b/bdr_locks.c
@@ -147,9 +147,6 @@ static BdrLocksCtl *bdr_locks_ctl;
  /* shmem init hook to chain to on startup, if any */
  static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
  
-/* number of per database slots */
-static int bdr_locks_num_databases;
-
  /* this database's state */
  static BdrLocksDBState *bdr_my_locks_database = NULL;
  
@@ -161,7 +158,7 @@ bdr_locks_shmem_size(void)
     Size        size = 0;
  
     size = add_size(size, sizeof(BdrLocksCtl));
-   size = add_size(size, mul_size(sizeof(BdrLocksDBState), bdr_locks_num_databases));
+   size = add_size(size, mul_size(sizeof(BdrLocksDBState), bdr_max_databases));
  
     return size;
  }
@@ -188,13 +185,12 @@ bdr_locks_shmem_startup(void)
  
  /* Needs to be called from a shared_preload_library _PG_init() */
  void
-bdr_locks_shmem_init(Size num_used_databases)
+bdr_locks_shmem_init()
  {
     /* Must be called from postmaster its self */
     Assert(IsPostmasterEnvironment && !IsUnderPostmaster);
  
     bdr_locks_ctl = NULL;
-   bdr_locks_num_databases = num_used_databases;
  
     RequestAddinShmemSpace(bdr_locks_shmem_size());
     RequestAddinLWLocks(1);
@@ -212,7 +208,7 @@ bdr_locks_find_database(Oid dboid, bool create)
     int off;
     int free_off = -1;
  
-   for(off = 0; off < bdr_locks_num_databases; off++)
+   for(off = 0; off < bdr_max_databases; off++)
     {
         BdrLocksDBState *db = &bdr_locks_ctl->dbstate[off];
  
@@ -243,14 +239,11 @@ bdr_locks_find_database(Oid dboid, bool create)
         db->in_use = true;
         return db;
     }
-   /*
-    * Shouldn't happen with BDR statically configured, as the shmem segment
-    * gets sized for the number of BDR-enabled databases. Later will be
-    * affected by any bdr_max_databases setting or whatever we add.
-    */
-   ereport(PANIC,
+
+   ereport(ERROR,
             (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
-           "Too many databases in use with BDR"));
+           errmsg("Too many databases BDR-enabled for bdr.max_databases"),
+           errhint("Increase bdr.max_databases above the current limit of %d", bdr_max_databases)));
  }
  
  static void
@@ -273,7 +266,7 @@ bdr_locks_find_my_database(bool create)
   * Called from the per-db worker.
   */
  void
-bdr_locks_startup(Size nnodes)
+bdr_locks_startup()
  {
     Relation        rel;
     SysScanDesc     scan;
@@ -296,7 +289,8 @@ bdr_locks_startup(Size nnodes)
     if (bdr_my_locks_database->locked_and_loaded)
         return;
  
-   bdr_my_locks_database->nnodes = nnodes;
+   /* We haven't yet established how many nodes we're connected to. */
+   bdr_my_locks_database->nnodes = 0;
  
     initStringInfo(&s);
  
@@ -387,6 +381,42 @@ bdr_locks_startup(Size nnodes)
     bdr_my_locks_database->locked_and_loaded = true;
  }
  
+/*
+ * Record the node count used by this database's BDR lock state.
+ *
+ * Stores nnodes into bdr_my_locks_database->nnodes. Asserts that the caller
+ * is a background worker and that bdr_my_locks_database has already been
+ * set. No lock is taken around the store; see the caveats below.
+ */
+void
+bdr_locks_set_nnodes(Size nnodes)
+{
+   Assert(IsBackgroundWorker);
+   Assert(bdr_my_locks_database != NULL);
+
+   /*
+    * XXX DYNCONF No protection against node addition during DDL lock acquire
+    *
+    * Node counts are currently grabbed straight from the perdb worker's shmem
+    * and could change whenever someone adds a worker, with no locking or
+    * protection.
+    *
+    * We could acquire the local DDL lock before setting the nodecount, which
+    * would cause requests from other nodes to get rejected and cause other
+    * local tx's to fail to request the global DDL lock. However, we'd have to
+    * acquire it when we committed to adding the new worker, which happens in
+    * a user backend, and release it from the perdb worker once the new worker
+    * is registered. Fragile.
+    *
+    * Doing so also fails to solve the other half of the problem, which is
+    * that DDL locking expects there to be one bdr walsender for each apply
+    * worker, i.e. each connection should be reciprocal. We could connect to
+    * the other end and register a connection back to us, but that's getting
+    * complicated for what's always going to be a temporary option before a
+    * full part/join protocol is added.
+    *
+    * So we're just going to cross our fingers. Worst case is that DDL locking
+    * gets stuck and we have to restart all the nodes.
+    *
+    * The full part/join protocol will solve this by acquiring the DDL lock
+    * before joining.
+    */
+   bdr_my_locks_database->nnodes = nnodes;
+}
+
  
  static void
  bdr_prepare_message(StringInfo s, BdrMessageType message_type)
@@ -506,6 +536,14 @@ bdr_acquire_ddl_lock(void)
  
     bdr_locks_find_my_database(false);
  
+   if (bdr_my_locks_database->nnodes == 0)
+   {
+       ereport(ERROR,
+               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                errmsg("No peer nodes or peer node count unknown, cannot acquire DDL lock"),
+                errhint("BDR is probably still starting up, wait a while")));
+   }
+
     elog(DEBUG2, "attempting to acquire global DDL lock for (" BDR_LOCALID_FORMAT ")", BDR_LOCALID_FORMAT_ARGS);
  
     /* send message about ddl lock */
@@ -996,7 +1034,7 @@ bdr_process_decline_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
   * Another node has asked us to confirm that we've replayed up to a given LSN.
   * We've seen the request message, so send the requested confirmation.
   *
- * Runs in the walsender.
+ * Runs in the apply worker.
   */
  void
  bdr_process_request_replay_confirm(uint64 sysid, TimeLineID tli,
@@ -1257,12 +1295,12 @@ bdr_locks_check_query(void)
  
  /* bdr_locks are not used by UDR at the moment */
  void
-bdr_locks_startup(Size nnodes)
+bdr_locks_startup()
  {
  }
  
  void
-bdr_locks_shmem_init(Size num_used_databases)
+bdr_locks_shmem_init()
  {
  }
  
diff --git a/bdr_locks.h b/bdr_locks.h

index 5549089d119535cc9d7d3c961db511d82e5e4671..4b855f3bbcea76cd61ee3ecf2ad35e5fdc26c9ad 100644 (file)
--- a/bdr_locks.h
+++ b/bdr_locks.h
@@ -21,7 +21,8 @@ typedef enum BdrMessageType
     BDR_MESSAGE_REPLAY_CONFIRM = 6
  } BdrMessageType;
  
-void bdr_locks_startup(Size nnodes);
+void bdr_locks_startup(void);
+void bdr_locks_set_nnodes(Size nnodes);
  void bdr_acquire_ddl_lock(void);
  void bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid);
  void bdr_process_release_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid,
diff --git a/bdr_output.c b/bdr_output.c

index 90606c0f1ad3dc3a5dd3cbf4f8aebadf4be2caf3..e6d63355598b8dfea03d822cecb9cdf00270d223 100644 (file)
--- a/bdr_output.c
+++ b/bdr_output.c
@@ -74,6 +74,7 @@ typedef struct
     bool client_float8_byval;
     bool client_int_datetime;
     char *client_db_encoding;
+   bool client_unidirectional;
     Oid bdr_schema_oid;
     Oid bdr_conflict_handlers_reloid;
     Oid bdr_locks_reloid;
@@ -228,14 +229,19 @@ bdr_req_param(const char *param)
   * If this function returns it's safe to begin replay.
   */
  static void
-bdr_ensure_node_ready()
+bdr_ensure_node_ready(BdrOutputData *data)
  {
     int spi_ret;
     const uint64 sysid = GetSystemIdentifier();
     char status;
+   BDRNodeInfo *node;
     NameData dbname;
     char *tmp_dbname;
  
+   /* Unidirectional connections don't require any checks atm. */
+   if (data->client_unidirectional)
+       return;
+
     /* We need dbname valid outside this transaction, so copy it */
     tmp_dbname = get_database_name(MyDatabaseId);
     strncpy(NameStr(dbname), tmp_dbname, NAMEDATALEN);
@@ -250,32 +256,18 @@ bdr_ensure_node_ready()
     if (spi_ret != SPI_OK_CONNECT)
         elog(ERROR, "Local SPI connect failed; shouldn't happen");
  
-   status = bdr_nodes_get_local_status(sysid, ThisTimeLineID, MyDatabaseId);
+   node = bdr_nodes_get_local_info(sysid, ThisTimeLineID, MyDatabaseId);
+   status = node == NULL ? '\0' : node->status;
+   bdr_bdr_node_free(node);
  
     SPI_finish();
  
-/*
- * There is no local node status for UDR as we have only connection to this
- * node coming from a slave. The above is still useful to make sure the
- * extension is installed in the db.
- */
-#ifdef BUILDING_UDR
-   switch (status)
-   {
-       case 'r':
-       case '\0':
-       case 'c':
-       case 'i':
-           break;
-       default:
-           elog(ERROR, "Unhandled case status=%c", status);
-           break;
-   }
-#else
-
-   /* Complain if node isn't ready. */
+   /*
+    * Complain unless the node is ready, i.e. its state is either fully
+    * 'r'eady or waiting for inbound sl'o't creation.
+    */
     /* TODO: Allow soft error so caller can sleep and recheck? */
-   if (status != 'r')
+   if (status != 'r' && status != 'o')
     {
         const char * const base_msg =
             "bdr output plugin: slot creation rejected, bdr.bdr_nodes entry for local node (sysid=" UINT64_FORMAT
@@ -283,8 +275,10 @@ bdr_ensure_node_ready()
         switch (status)
         {
             case 'r':
+           case 'o':
                 break; /* unreachable */
             case '\0':
+           case 'b':
                 /*
                  * Can't allow replay when BDR hasn't started yet, as
                  * replica init might still need to run, causing a dump to
@@ -338,7 +332,6 @@ bdr_ensure_node_ready()
                 break;
         }
     }
-#endif
  }
  
  
@@ -411,6 +404,8 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
             data->client_db_encoding = pstrdup(strVal(elem->arg));
         else if (strcmp(elem->defname, "forward_changesets") == 0)
             bdr_parse_bool(elem, &data->forward_changesets);
+       else if (strcmp(elem->defname, "unidirectional") == 0)
+           bdr_parse_bool(elem, &data->client_unidirectional);
         else if (strcmp(elem->defname, "replication_sets") == 0)
         {
             int i;
@@ -482,12 +477,7 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
         StartTransactionCommand();
     }
  
-#ifdef BUILDING_BDR
-   /*
-    * If running BDR, we expect the remote end (us) to have the BDR extension
-    * installed before we permit slot creation. This prevents replication of
-    * the CREATE EXTENSION bdr; command its self.
-    */
+   /* BDR extension must be installed. */
     if (get_namespace_oid("bdr", true) == InvalidOid)
     {
         ereport(ERROR,
@@ -496,7 +486,6 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
                         BDR_LOCALID_FORMAT_ARGS),
                  errdetail("Cannot create a BDR slot without the BDR extension installed")));
     }
-#endif
  
     /* no options are passed in during initialization, so don't complain there */
     if (!is_init)
@@ -521,6 +510,15 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
         if (data->client_db_encoding == NULL)
             bdr_req_param("db_encoding");
  
+#ifdef BUILDING_UDR
+       /* Can't do bidirectional connection on UDR. */
+       if (!data->client_unidirectional)
+           ereport(ERROR,
+                   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                    errmsg("UDR only supports unidirectional connections")));
+
+#endif
+
         /* check incompatibilities we cannot work around */
         if (strcmp(data->client_db_encoding, GetDatabaseEncodingName()) != 0)
             elog(ERROR, "mismatching encodings are not yet supported");
@@ -583,7 +581,7 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
         if (data->client_pg_version / 100 != PG_VERSION_NUM / 100)
             data->allow_sendrecv_protocol = false;
  
-       bdr_maintain_schema();
+       bdr_maintain_schema(false);
  
         data->bdr_schema_oid = get_namespace_oid("bdr", true);
         schema_oid = data->bdr_schema_oid;
@@ -618,7 +616,7 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
          * This'll ERROR out if we're not ready. Note that this does NOT
          * prevent slot creation, only START_REPLICATION from the slot.
          */
-       bdr_ensure_node_ready();
+       bdr_ensure_node_ready(data);
     }
  
     if (tx_started)
diff --git a/bdr_perdb.c b/bdr_perdb.c

index 644de7fa6a6d2a3f4070b7456798f2a94f5a5336..0f5c72d48012753cd066f99ca0d7daee397a819b 100644 (file)
--- a/bdr_perdb.c
+++ b/bdr_perdb.c
@@ -44,92 +44,423 @@
  #include "utils/memutils.h"
  #include "utils/snapmgr.h"
  
+PG_FUNCTION_INFO_V1(bdr_connections_changed);
+
+Datum
+bdr_connections_changed(PG_FUNCTION_ARGS);
+
+/*
+ * In the commit hook, should we attempt to start a per-db worker?
+ *
+ * Set by bdr_connections_changed() and cleared by bdr_perdb_xact_callback()
+ * when it handles the request. On commit the callback wakes the per-db
+ * worker for this database or, if none is registered, the supervisor.
+ */
+static bool xacthook_connection_added = false;
+
+/*
+ * Locate the shared-memory slot of the per-db worker registered for the
+ * database with OID dboid.
+ *
+ * Returns the slot index, or -1 when no per-db worker slot matches. When a
+ * match is found and worker_found is non-NULL, *worker_found is pointed at
+ * the matching slot; otherwise it is left untouched.
+ *
+ * The caller must hold the LWLock on the worker control segment in at
+ * least share mode.
+ *
+ * A matching slot does not guarantee that the worker has actually started.
+ */
+int
+find_perdb_worker_slot(Oid dboid, BdrWorker **worker_found)
+{
+   int slotno;
+
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+   for (slotno = 0; slotno < bdr_max_workers; slotno++)
+   {
+       BdrWorker *candidate = &BdrWorkerCtl->slots[slotno];
+
+       /* Only per-db worker slots for the requested database qualify */
+       if (candidate->worker_type != BDR_WORKER_PERDB)
+           continue;
+       if (candidate->data.perdb.database_oid != dboid)
+           continue;
+
+       if (worker_found != NULL)
+           *worker_found = candidate;
+       return slotno;
+   }
+
+   return -1;
+}
+
+/*
+ * Locate the shared-memory slot of the apply worker that belongs to the
+ * current database (MyDatabaseId) and replicates from the remote node
+ * identified by (sysid, timeline, dboid).
+ *
+ * Returns the slot index, or -1 when no apply worker slot matches. When a
+ * match is found and worker_found is non-NULL, *worker_found is pointed at
+ * the matching slot; otherwise it is left untouched.
+ *
+ * Must be called from a per-db worker that holds the LWLock on the worker
+ * control segment in at least share mode.
+ *
+ * A matching slot does not guarantee that the worker has actually started.
+ */
+static int
+find_apply_worker_slot(uint64 sysid, TimeLineID timeline, Oid dboid, BdrWorker **worker_found)
+{
+   int slotno;
+
+   Assert(bdr_worker_type == BDR_WORKER_PERDB);
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+   for (slotno = 0; slotno < bdr_max_workers; slotno++)
+   {
+       BdrWorker      *candidate = &BdrWorkerCtl->slots[slotno];
+       BdrApplyWorker *apply;
+
+       if (candidate->worker_type != BDR_WORKER_APPLY)
+           continue;
+
+       /* Skip apply workers for other local DBs or other remote nodes */
+       apply = &candidate->data.apply;
+       if (apply->dboid != MyDatabaseId
+           || apply->remote_sysid != sysid
+           || apply->remote_timeline != timeline
+           || apply->remote_dboid != dboid)
+           continue;
+
+       if (worker_found != NULL)
+           *worker_found = candidate;
+       return slotno;
+   }
+
+   return -1;
+}
+
+/*
+ * Transaction callback that delivers a pending "connections changed"
+ * notification requested by bdr_connections_changed().
+ *
+ * On commit, if a notification is pending, the per-db worker registered for
+ * the current database has its latch set; if no per-db worker is registered
+ * the supervisor's latch is set instead so it can register one.
+ *
+ * On abort the pending notification is discarded: the change that asked for
+ * it was rolled back, and leaving the flag set would cause a spurious wakeup
+ * at the next unrelated commit in this backend.
+ *
+ * The arg parameter is unused.
+ */
+static void
+bdr_perdb_xact_callback(XactEvent event, void *arg)
+{
+   switch (event)
+   {
+       case XACT_EVENT_COMMIT:
+           if (xacthook_connection_added)
+           {
+               int slotno;
+               BdrWorker *w;
+
+               xacthook_connection_added = false;
+
+               LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+               /*
+                * If a perdb worker already exists, wake it and tell it to
+                * check for new connections.
+                */
+               slotno = find_perdb_worker_slot(MyDatabaseId, &w);
+               if (slotno >= 0)
+               {
+                   /*
+                    * The worker is registered, but might not be started yet
+                    * (or could be crashing and restarting). If it's not
+                    * started the latch will be zero. If it's started but
+                    * dead, the latch will be bogus, but it's safe to set a
+                    * proclatch to a dead process. At worst we'll set a latch
+                    * for the wrong process, and that's fine. If it's zero
+                    * then the worker is still starting and will see our new
+                    * changes anyway.
+                    */
+                   if (w->data.perdb.proclatch != NULL)
+                       SetLatch(w->data.perdb.proclatch);
+               }
+               else
+               {
+                   /*
+                    * Per-db worker doesn't exist, ask the supervisor to check for
+                    * changes and register new per-db workers for labeled
+                    * databases.
+                    */
+                   if (BdrWorkerCtl->supervisor_latch)
+                       SetLatch(BdrWorkerCtl->supervisor_latch);
+               }
+
+               LWLockRelease(BdrWorkerCtl->lock);
+           }
+           break;
+       case XACT_EVENT_ABORT:
+           /* The requesting transaction rolled back; nothing to announce */
+           xacthook_connection_added = false;
+           break;
+       default:
+           /* We're not interested in other tx events */
+           break;
+   }
+}
+
+/*
+ * SQL-callable function: request that BDR re-read bdr.bdr_connections once
+ * the calling transaction commits.
+ *
+ * Marks a notification as pending for this backend. The actual wakeup of
+ * the per-db worker (or of the supervisor, when no per-db worker exists for
+ * this database yet) is performed by bdr_perdb_xact_callback() at commit.
+ *
+ * The transaction callback is registered at most once per backend.
+ * Registered callbacks stay installed until explicitly unregistered, so
+ * registering again on every call would accumulate duplicate entries for
+ * the lifetime of the session.
+ *
+ * Takes no SQL arguments and returns void.
+ */
+Datum
+bdr_connections_changed(PG_FUNCTION_ARGS)
+{
+   /* Has this backend already installed bdr_perdb_xact_callback? */
+   static bool xacthook_registered = false;
+
+   if (!xacthook_registered)
+   {
+       RegisterXactCallback(bdr_perdb_xact_callback, NULL);
+       xacthook_registered = true;
+   }
+
+   /* Ask the commit hook to notify the workers */
+   xacthook_connection_added = true;
+
+   PG_RETURN_VOID();
+}
+
+/*
+ * Return the attribute number of column colname in the current SPI result
+ * (SPI_tuptable). Raises an ERROR if the result has no such column.
+ */
+static int
+getattno(const char *colname)
+{
+   const int colno = SPI_fnumber(SPI_tuptable->tupdesc, colname);
+
+   if (colno == SPI_ERROR_NOATTRIBUTE)
+       elog(ERROR, "SPI error while reading %s from bdr.bdr_connections", colname);
+
+   return colno;
+}
+
  /*
   * Launch a dynamic bgworker to run bdr_apply_main for each bdr connection on
   * the database identified by dbname.
   *
- * Scans the BdrWorkerCtl shmem segment for workers of type BDR_WORKER_APPLY
- * with a matching database name and launches them.
+ * Scans the bdr.bdr_connections table for workers and launches a worker for
+ * any connection that doesn't already have one.
+ *
+ * Must be called from the per-db worker while no transaction is open; the
+ * function starts and commits its own transaction around the SPI query.
+ * The dboid argument is only used in a debug message: the query and the
+ * shmem lookups use MyDatabaseId.
+ *
+ * For every row that has no matching apply worker slot, a shmem slot is
+ * allocated and filled in under BdrWorkerCtl->lock, then the bgworker is
+ * registered. If registration fails the slot is reset to
+ * BDR_WORKER_EMPTY_SLOT and an ERROR is raised.
+ *
+ * Afterwards the number of rows returned by the query is stored as the node
+ * count in this worker's shmem slot and, when BUILDING_BDR is defined,
+ * passed to bdr_locks_set_nnodes() and bdr_sequencer_set_nnodes().
   */
-static List*
-bdr_launch_apply_workers(char *dbname)
+void
+bdr_launch_apply_workers(Oid dboid)
  {
-   List             *apply_workers = NIL;
-   BackgroundWorker  apply;
-   int               i;
-
+   BackgroundWorker    bgw;
+   int                 i, ret;
+   Size                nnodes = 0;
+#define BDR_CON_Q_NARGS 3
+   Oid                 argtypes[BDR_CON_Q_NARGS] = { TEXTOID, OIDOID, OIDOID };
+   Datum               values[BDR_CON_Q_NARGS];
+   char                sysid_str[33];
+
+   /* Should be called from the perdb worker */
     Assert(IsBackgroundWorker);
+   Assert(bdr_worker_type == BDR_WORKER_PERDB);
+
+   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, GetSystemIdentifier());
+   sysid_str[sizeof(sysid_str)-1] = '\0';
+
+   elog(DEBUG2, "launching apply workers");
+
+   /*
+    * It's easy enough to make this tolerant of an open tx, but in general
+    * rollback doesn't make sense here.
+    */
+   Assert(!IsTransactionState());
  
     /* Common apply worker values */
-   apply.bgw_flags = BGWORKER_SHMEM_ACCESS |
+   bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
         BGWORKER_BACKEND_DATABASE_CONNECTION;
-   apply.bgw_start_time = BgWorkerStart_RecoveryFinished;
-   apply.bgw_main = NULL;
-   strncpy(apply.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
-   strncpy(apply.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
-   apply.bgw_restart_time = 5;
-   apply.bgw_notify_pid = 0;
-
-   /* Launch apply workers */
-   LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
-   for (i = 0; i < bdr_max_workers; i++)
+   bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   bgw.bgw_main = NULL;
+   strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+   strncpy(bgw.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
+   bgw.bgw_restart_time = 5;
+   bgw.bgw_notify_pid = 0;
+
+   StartTransactionCommand();
+
+   /*
+    * Look up connection entries for all nodes other than our own.
+    *
+    * If an entry with our origin (sysid,tlid,dboid) exists, treat that as
+    * overriding the generic one.
+    *
+    * NOTE(review): the ORDER BY below lists conn_timeline and conn_dboid a
+    * second time after conn_origin_sysid; conn_origin_timeline and
+    * conn_origin_dboid may have been intended. Ascending order on
+    * conn_origin_sysid also looks like it would sort the generic '0' origin
+    * first - confirm which row DISTINCT ON is meant to keep.
+    */
+   values[0] = CStringGetTextDatum(sysid_str);
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+
+   SPI_connect();
+
+   ret = SPI_execute_with_args(
+           "SELECT DISTINCT ON (conn_sysid, conn_timeline, conn_dboid) "
+           "  conn_sysid, conn_timeline, conn_dboid, "
+           "  conn_is_unidirectional, "
+           "  conn_origin_dboid <> 0 AS origin_is_my_id "
+           "FROM bdr.bdr_connections "
+           "WHERE ( "
+           "         (conn_origin_sysid = '0' AND "
+           "          conn_origin_timeline = 0 AND "
+           "          conn_origin_dboid = 0) "
+           "         OR "
+           "         (conn_origin_sysid = $1 AND "
+           "          conn_origin_timeline = $2 AND "
+           "          conn_origin_dboid = $3) "
+           "      ) AND NOT ( "
+           "          conn_sysid = $1 AND "
+           "          conn_timeline = $2 AND "
+           "          conn_dboid = $3"
+           "      ) "
+           "ORDER BY conn_sysid, conn_timeline, conn_dboid, "
+           "         conn_origin_sysid ASC NULLS LAST, "
+           "         conn_timeline ASC NULLS LAST, "
+           "         conn_dboid ASC NULLS LAST ",
+       BDR_CON_Q_NARGS, argtypes, values, NULL,
+       false, 0);
+
+   if (ret != SPI_OK_SELECT)
+       elog(ERROR, "SPI error while querying bdr.bdr_connections");
+
+   nnodes = SPI_processed;
+
+   elog(DEBUG2, "found %u workers in bdr_connections", (uint32)nnodes);
+
+   for (i = 0; i < SPI_processed; i++)
     {
-       BdrWorker *worker = &BdrWorkerCtl->slots[i];
+       BackgroundWorkerHandle *bgw_handle;
+       HeapTuple               tuple;
+       uint32                  slot;
+       uint32                  worker_arg;
+       BdrWorker              *worker;
+       BdrApplyWorker         *apply;
+       Datum                   temp_datum;
+       bool                    isnull;
+       uint64                  target_sysid;
+       TimeLineID              target_timeline;
+       Oid                     target_dboid;
+       char*                   tmp_sysid;
+       bool                    origin_is_my_id,
+                               conn_is_unidirectional;
+
+       tuple = SPI_tuptable->vals[i];
+
+       tmp_sysid = SPI_getvalue(tuple, SPI_tuptable->tupdesc,
+                                getattno("conn_sysid"));
+
+       if (sscanf(tmp_sysid, UINT64_FORMAT, &target_sysid) != 1)
+           elog(ERROR, "Parsing sysid uint64 from %s failed", tmp_sysid);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("conn_timeline"),
+                                  &isnull);
+       Assert(!isnull);
+       target_timeline = DatumGetObjectId(temp_datum);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("conn_dboid"),
+                                  &isnull);
+       Assert(!isnull);
+       target_dboid = DatumGetObjectId(temp_datum);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("conn_is_unidirectional"),
+                                  &isnull);
+       Assert(!isnull);
+       conn_is_unidirectional = DatumGetBool(temp_datum);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("origin_is_my_id"),
+                                  &isnull);
+       Assert(!isnull);
+       origin_is_my_id = DatumGetBool(temp_datum);
+
+       elog(DEBUG2, "Found bdr_connections entry for "BDR_LOCALID_FORMAT" (origin specific: %d, unidirectional: %d)",
+            target_sysid, target_timeline, target_dboid,
+            EMPTY_REPLICATION_NAME, (int)origin_is_my_id, (int)conn_is_unidirectional);
+
+       Assert(!LWLockHeldByMe(BdrWorkerCtl->lock));
+       LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
  
-       switch(worker->worker_type)
+       /*
+        * Is there already a worker registered for this connection?
+        *
+        * TODO DYNCONF Each apply worker should have its latch set and respond
+        * by checking to see whether it needs to apply any new configuration.
+        */
+       if (find_apply_worker_slot(target_sysid, target_timeline, target_dboid, NULL) != -1)
         {
-           case BDR_WORKER_APPLY:
-               {
-                   BdrApplyWorker *con = &worker->data.apply;
-                   BdrConnectionConfig *cfg =
-                       bdr_connection_configs[con->connection_config_idx];
-                   Assert(cfg != NULL);
-                   if ( strcmp(cfg->dbname, dbname) == 0 )
-                   {
-                       /* It's an apply worker for our DB; register it */
-                       BackgroundWorkerHandle *bgw_handle;
-
-                       if (con->bgw_is_registered)
-                           /*
-                            * This worker was registered on a previous pass;
-                            * this is probably a restart of the per-db worker.
-                            * Don't register a duplicate.
-                            */
-                           continue;
-
-                       snprintf(apply.bgw_name, BGW_MAXLEN,
-                                BDR_LOCALID_FORMAT": %s: apply",
-                                BDR_LOCALID_FORMAT_ARGS, cfg->name);
-                       apply.bgw_main_arg = Int32GetDatum(i);
-
-                       if (!RegisterDynamicBackgroundWorker(&apply,
-                                                            &bgw_handle))
-                       {
-                           ereport(ERROR,
-                                   (errmsg("bdr: Failed to register background worker"
-                                           " %s, see previous log messages",
-                                           cfg->name)));
-                       }
-                       /* We've launched this one, don't do it again */
-                       con->bgw_is_registered = true;
-                       apply_workers = lcons(bgw_handle, apply_workers);
-                   }
-               }
-               break;
-           case BDR_WORKER_EMPTY_SLOT:
-           case BDR_WORKER_PERDB:
-               /* Nothing to do; switch only so we get warnings for insane cases */
-               break;
-           default:
-               /* Bogus value */
-               elog(FATAL, "Unhandled BdrWorkerType case %i, memory corruption?",
-                    worker->worker_type);
-               break;
+           elog(DEBUG2, "Skipping registration of worker for node "BDR_LOCALID_FORMAT" on db oid=%u: already registered",
+                target_sysid, target_timeline, target_dboid,
+                EMPTY_REPLICATION_NAME, dboid);
+           LWLockRelease(BdrWorkerCtl->lock);
+           continue;
+       }
+
+       /* Set the display name in 'ps' etc */
+       snprintf(bgw.bgw_name, BGW_MAXLEN,
+                BDR_LOCALID_FORMAT"->"BDR_LOCALID_FORMAT,
+                BDR_LOCALID_FORMAT_ARGS,
+                target_sysid, target_timeline, target_dboid,
+                EMPTY_REPLICATION_NAME);
+
+       /* Allocate a new shmem slot for this apply worker */
+       worker = bdr_worker_shmem_alloc(BDR_WORKER_APPLY, &slot);
+
+       /* Tell the apply worker what its shmem slot is */
+       Assert(slot <= UINT16_MAX);
+       worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)slot;
+       bgw.bgw_main_arg = Int32GetDatum(worker_arg);
+
+       /*
+        * Apply workers (other than in catchup mode, which are registered
+        * elsewhere) should not be using the local node's connection entry.
+        */
+       Assert(!(target_sysid == GetSystemIdentifier() &&
+                target_timeline == ThisTimeLineID &&
+                target_dboid == MyDatabaseId));
+
+       /* Now populate the apply worker state */
+       apply = &worker->data.apply;
+       apply->dboid = MyDatabaseId;
+       apply->remote_sysid = target_sysid;
+       apply->remote_timeline = target_timeline;
+       apply->remote_dboid = target_dboid;
+       apply->replay_stop_lsn = InvalidXLogRecPtr;
+       apply->forward_changesets = false;
+
+       LWLockRelease(BdrWorkerCtl->lock);
+
+       /*
+        * Finally, register the worker for launch.
+        */
+       if (!RegisterDynamicBackgroundWorker(&bgw,
+                                            &bgw_handle))
+       {
+           /*
+            * Already-registered workers will keep on running.  We need to
+            * make sure the slot we just acquired but failed to launch a
+            * worker for gets released again though.
+            */
+           LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+           apply->dboid = InvalidOid;
+           apply->remote_sysid = 0;
+           apply->remote_timeline = 0;
+           apply->remote_dboid = InvalidOid;
+           worker->worker_type = BDR_WORKER_EMPTY_SLOT;
+           LWLockRelease(BdrWorkerCtl->lock);
+
+           ereport(ERROR,
+                   (errmsg("bdr: Failed to register background worker"
+                           " for "BDR_LOCALID_FORMAT", see previous log messages",
+                           BDR_LOCALID_FORMAT_ARGS)));
+       }
+       else
+       {
+           elog(DEBUG2, "registered apply worker for "BDR_LOCALID_FORMAT,
+                target_sysid, target_timeline, target_dboid,
+                EMPTY_REPLICATION_NAME);
         }
     }
-   LWLockRelease(BdrWorkerCtl->lock);
  
-   return apply_workers;
+   SPI_finish();
+
+   CommitTransactionCommand();
+
+   elog(DEBUG2, "done registering apply workers");
+
+   /*
+    * Now we need to tell the lock manager and the sequence
+    * manager about the changed node count.
+    *
+    * There's no truly safe way to do this without a proper
+    * part/join protocol, so all we're going to do is update
+    * the node count in shared memory.
+    */
+   bdr_worker_slot->data.perdb.nnodes = nnodes;
+#ifdef BUILDING_BDR
+   bdr_locks_set_nnodes(nnodes);
+   bdr_sequencer_set_nnodes(nnodes);
+#endif
+
+   elog(DEBUG2, "updated worker counts");
  }
  
  /*
@@ -148,60 +479,106 @@ bdr_launch_apply_workers(char *dbname)
  void
  bdr_perdb_worker_main(Datum main_arg)
  {
-   int               rc = 0;
-   List             *apply_workers;
-   ListCell         *c;
-   BdrPerdbWorker   *perdb;
-   BdrWorker        *bdr_worker_slot;
-   StringInfoData    si;
-   bool              wait;
+   int                 rc = 0;
+   BdrPerdbWorker      *perdb;
+   StringInfoData      si;
+   bool                wait;
+   uint32              worker_arg;
+   uint16              worker_generation;
+   uint16              perdb_worker_idx;
+   BDRNodeInfo        *local_node;
  
     initStringInfo(&si);
  
     Assert(IsBackgroundWorker);
  
-   bdr_worker_slot = &BdrWorkerCtl->slots[ DatumGetInt32(main_arg) ];
+   worker_arg = DatumGetInt32(main_arg);
+
+   worker_generation = (uint16)(worker_arg >> 16);
+   perdb_worker_idx = (uint16)(worker_arg & 0x0000FFFF);
+
+   if (worker_generation != BdrWorkerCtl->worker_generation)
+   {
+       elog(DEBUG1, "perdb worker from generation %d exiting after finding shmem generation is %d",
+            worker_generation, BdrWorkerCtl->worker_generation);
+       proc_exit(0);
+   }
+
+   bdr_worker_slot = &BdrWorkerCtl->slots[perdb_worker_idx];
     Assert(bdr_worker_slot->worker_type == BDR_WORKER_PERDB);
     perdb = &bdr_worker_slot->data.perdb;
     bdr_worker_type = BDR_WORKER_PERDB;
  
     bdr_worker_init(NameStr(perdb->dbname));
  
+   perdb->nnodes = 0;
+
     elog(DEBUG1, "per-db worker for node " BDR_LOCALID_FORMAT " starting", BDR_LOCALID_FORMAT_ARGS);
  
-   appendStringInfo(&si, BDR_LOCALID_FORMAT": %s", BDR_LOCALID_FORMAT_ARGS, "perdb worker");
+   appendStringInfo(&si, BDR_LOCALID_FORMAT": %s", BDR_LOCALID_FORMAT_ARGS, "perdb");
     SetConfigOption("application_name", si.data, PGC_USERSET, PGC_S_SESSION);
  
     CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr seq top-level resource owner");
     bdr_saved_resowner = CurrentResourceOwner;
  
+   /*
+    * It's necessary to acquire a lock here so that a concurrent
+    * bdr_perdb_xact_callback can't try to set our latch at the same
+    * time as we write to it.
+    *
+    * There's no per-worker lock, so we just take the lock on the
+    * whole segment.
+    */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+   perdb->proclatch = &MyProc->procLatch;
+   perdb->database_oid = MyDatabaseId;
+   LWLockRelease(BdrWorkerCtl->lock);
+
     /* need to be able to perform writes ourselves */
     bdr_executor_always_allow_writes(true);
-   bdr_locks_startup(perdb->nnodes);
+   bdr_locks_startup();
+
+   {
+       int             spi_ret;
+       MemoryContext   saved_ctx;
+
+       /*
+        * Check the local bdr.bdr_nodes table to see if there's an entry for
+        * our node.
+        *
+        * Note that we don't have to explicitly SPI_finish(...) on error paths;
+        * that's taken care of for us.
+        */
+       StartTransactionCommand();
+       spi_ret = SPI_connect();
+       if (spi_ret != SPI_OK_CONNECT)
+           elog(ERROR, "SPI already connected; this shouldn't be possible");
+
+       saved_ctx = MemoryContextSwitchTo(TopMemoryContext);
+       local_node = bdr_nodes_get_local_info(GetSystemIdentifier(), ThisTimeLineID,
+                                         MyDatabaseId);
+       MemoryContextSwitchTo(saved_ctx);
+
+       if (local_node == NULL)
+           ereport(ERROR,
+                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                    errmsg("local node record not found")));
+
+       SPI_finish();
+       CommitTransactionCommand();
+   }
  
     /*
      * Do we need to init the local DB from a remote node?
-    *
-    * Checks bdr.bdr_nodes.status, does any remote initialization required if
-    * there's an init_replica connection, and ensures that
-    * bdr.bdr_nodes.status=r for our entry before continuing.
      */
-   bdr_init_replica(&perdb->dbname);
+   if (local_node->status != 'r')
+       bdr_init_replica(local_node);
  
-   elog(DEBUG1, "Starting bdr apply workers for db %s", NameStr(perdb->dbname));
+   elog(DEBUG1, "Starting bdr apply workers for "BDR_LOCALID_FORMAT" (%s)",
+        BDR_LOCALID_FORMAT_ARGS, NameStr(perdb->dbname));
  
     /* Launch the apply workers */
-   apply_workers = bdr_launch_apply_workers(NameStr(perdb->dbname));
-
-   /*
-    * For now, just free the bgworker handles. Later we'll probably want them
-    * for adding/removing/reconfiguring bgworkers.
-    */
-   foreach(c, apply_workers)
-   {
-       BackgroundWorkerHandle *h = (BackgroundWorkerHandle *) lfirst(c);
-       pfree(h);
-   }
+   bdr_launch_apply_workers(MyDatabaseId);
  
  #ifdef BUILDING_BDR
     elog(DEBUG1, "BDR starting sequencer on db \"%s\"",
@@ -260,8 +637,18 @@ bdr_perdb_worker_main(Datum main_arg)
             /* emergency bailout if postmaster has died */
             if (rc & WL_POSTMASTER_DEATH)
                 proc_exit(1);
+
+           if (rc & WL_LATCH_SET)
+           {
+               /*
+                * If the perdb worker's latch is set we're being asked
+                * to rescan and launch new apply workers.
+                */
+               bdr_launch_apply_workers(MyDatabaseId);
+           }
         }
     }
  
+   perdb->database_oid = InvalidOid;
     proc_exit(0);
  }
diff --git a/bdr_regress_bdr.conf b/bdr_regress_bdr.conf

index 47d1fe7978b25c9500f8c714ca8e6d3731e7b5f0..c5ae0546bb612ae0d3edf3c087cacce0de5151dd 100644 (file)
--- a/bdr_regress_bdr.conf
+++ b/bdr_regress_bdr.conf
@@ -2,16 +2,6 @@ track_commit_timestamp = on
  
  include = 'bdr_regress_common.conf'
  
-bdr.connections = 'node1, node2'
-
-bdr.node1_dsn = 'dbname=postgres'
-bdr.node1_local_dbname = 'regression'
-bdr.node1_replication_sets = 'default, important, for-node-1'
-
-bdr.node2_dsn = 'dbname=regression'
-bdr.node2_local_dbname = 'postgres'
-bdr.node2_replication_sets = 'default, important, for-node-2, for-node-2-insert, for-node-2-update, for-node-2-delete'
-
  bdrtest.readdb1 = 'regression'
  bdrtest.readdb2 = 'postgres'
  bdrtest.writedb1 = 'regression'
diff --git a/bdr_relcache.c b/bdr_relcache.c

index e386a10a6b169a48fbc4b021b3afa2792960370e..d0f46c2be11f84dd7b5631cc85936a680c22b314 100644 (file)
--- a/bdr_relcache.c
+++ b/bdr_relcache.c
@@ -16,12 +16,15 @@
  
  #include "bdr.h"
  
+#include "access/genam.h"
  #include "access/heapam.h"
  #include "access/xact.h"
  
  #include "commands/seclabel.h"
  
+#include "utils/builtins.h"
  #include "utils/catcache.h"
+#include "utils/fmgroids.h"
  #include "utils/inval.h"
  #include "utils/jsonapi.h"
  #include "utils/json.h"
@@ -324,10 +327,6 @@ relation_in_replication_set(BDRRelation *r, const char *setname)
     return false;
  }
  
-#include "access/genam.h"
-#include "utils/builtins.h"
-#include "utils/fmgroids.h"
-
  static HeapTuple
  replset_lookup(Relation rel, const char *cname)
  {
diff --git a/bdr_seq.c b/bdr_seq.c

index 0c1742e8eefa9571376046f637173645b137a887..ac9602e029853812e7ec640f906a6ec24d4b563c 100644 (file)
--- a/bdr_seq.c
+++ b/bdr_seq.c
@@ -50,7 +50,7 @@ typedef struct BdrSequencerSlot
  
  typedef struct BdrSequencerControl
  {
-   size_t      slot;
+   int         next_slot;  /* returned, then incremented, by bdr_sequencer_get_next_free_slot() */
     BdrSequencerSlot slots[FLEXIBLE_ARRAY_MEMBER];
  } BdrSequencerControl;
  
@@ -429,6 +429,12 @@ bdr_sequencer_shmem_startup(void)
     {
         /* initialize */
         memset(BdrSequencerCtl, 0, bdr_sequencer_shmem_size());
+       /*
+        * next_slot allows perdb workers to allocate seq slots.
+        * The sequencer will likely be separated into a different
+        * worker later.
+        */
+       BdrSequencerCtl->next_slot = 0;
     }
     LWLockRelease(AddinShmemInitLock);
  
@@ -450,6 +456,19 @@ bdr_sequencer_shmem_init(int nnodes, int sequencers)
     shmem_startup_hook = bdr_sequencer_shmem_startup;
  }
  
+/*
+ * Hand out the next sequencer slot index.
+ *
+ * Returns the current value of BdrSequencerCtl->next_slot and advances the
+ * counter by one, so the perdb worker doing sequencer setup knows which
+ * slot to use for the next sequencer.
+ *
+ * This should go away once the sequencer is separated into its own worker.
+ *
+ * NOTE(review): no lock is taken and the index is not checked against the
+ * number of allocated slots here - presumably callers guarantee both;
+ * confirm.
+ */
+int
+bdr_sequencer_get_next_free_slot(void)
+{
+   int allocated = BdrSequencerCtl->next_slot;
+
+   BdrSequencerCtl->next_slot = allocated + 1;
+
+   return allocated;
+}
+
  void
  bdr_sequencer_wakeup(void)
  {
@@ -509,6 +528,13 @@ bdr_schedule_eoxact_sequencer_wakeup(void)
     bdr_seq_pending_wakeup = true;
  }
  
+/*
+ * Record a new node count in the sequencer slot selected by seq_slot.
+ */
+void
+bdr_sequencer_set_nnodes(Size nnodes)
+{
+   BdrSequencerCtl->slots[seq_slot].nnodes = nnodes;
+}
+
  void
  bdr_sequencer_init(int new_seq_slot, Size nnodes)
  {
diff --git a/bdr_supervisor.c b/bdr_supervisor.c

new file mode 100644 (file)

index 0000000..4353c45
--- /dev/null
+++ b/bdr_supervisor.c
@@ -0,0 +1,441 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr_supervisor.c
+ *     Cluster wide supervisor worker.
+ *
+ * Copyright (C) 2014-2015, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     bdr_supervisor.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+#include "bdr_label.h"
+
+#include "miscadmin.h"
+#include "pgstat.h"
+
+#include "access/relscan.h"
+#include "access/skey.h"
+#include "access/xact.h"
+
+#include "catalog/objectaddress.h"
+#include "catalog/pg_database.h"
+#include "catalog/pg_shseclabel.h"
+
+#include "commands/dbcommands.h"
+#include "commands/seclabel.h"
+
+#include "postmaster/bgworker.h"
+
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/ipc.h"
+
+#include "utils/builtins.h"
+#include "utils/elog.h"
+#include "utils/fmgroids.h"
+#include "utils/guc.h"
+
+/*
+ * Register a new per-database (perdb) worker for the named database.
+ *
+ * A BDR_WORKER_PERDB slot is allocated in the BDR worker shmem array, the
+ * database name is copied into it, and a dynamic background worker running
+ * bdr_perdb_worker_main is registered with the postmaster.
+ *
+ * dbname: name of the database the worker will serve. It is truncated to
+ * NAMEDATALEN - 1 bytes when stored in the shmem slot.
+ *
+ * The caller must already hold BdrWorkerCtl->lock (asserted below) and must
+ * make sure no perdb worker exists for this database yet; no duplicate check
+ * is made here.
+ *
+ * Raises ERROR if the dynamic background worker cannot be registered.
+ */
+static void
+bdr_register_perdb_worker(const char * dbname)
+{
+   BackgroundWorkerHandle *handle;
+   BackgroundWorker        bgworker;
+   BdrWorker              *shm_worker;
+   BdrPerdbWorker         *perdb_state;
+   unsigned int            slot_idx;
+   uint32                  packed_arg;
+
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+   elog(DEBUG2, "Registering per-db worker for db %s", dbname);
+
+   shm_worker = bdr_worker_shmem_alloc(BDR_WORKER_PERDB, &slot_idx);
+   perdb_state = &shm_worker->data.perdb;
+
+   /* Bounded copy; strncpy may not terminate, so force a trailing NUL */
+   strncpy(NameStr(perdb_state->dbname), dbname, NAMEDATALEN);
+   NameStr(perdb_state->dbname)[NAMEDATALEN-1] = '\0';
+
+   /* Nodecount is set when apply workers are registered */
+   perdb_state->nnodes = 0;
+#ifdef BUILDING_BDR
+   perdb_state->seq_slot = bdr_sequencer_get_next_free_slot();
+#endif
+
+   /*
+    * The rest of the perdb worker's shmem segment - proclatch
+    * and nnodes - gets set up by the worker during startup.
+    */
+
+   /*
+    * The main arg is composed of two uint16 parts: the worker generation
+    * number in the high 16 bits and the index into the worker shmem slots in
+    * the low 16 bits.
+    */
+   Assert(slot_idx <= UINT16_MAX);
+   packed_arg = (uint32) BdrWorkerCtl->worker_generation;
+   packed_arg <<= 16;
+   packed_arg |= (uint32) slot_idx;
+
+   /* Identity and entry point of the worker */
+   snprintf(bgworker.bgw_name, BGW_MAXLEN,
+            "bdr db: %s", dbname);
+   bgworker.bgw_main = NULL;
+   strncpy(bgworker.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+   strncpy(bgworker.bgw_function_name, "bdr_perdb_worker_main", BGW_MAXLEN);
+   bgworker.bgw_main_arg = Int32GetDatum(packed_arg);
+
+   /* Scheduling and capabilities */
+   bgworker.bgw_flags = BGWORKER_SHMEM_ACCESS |
+       BGWORKER_BACKEND_DATABASE_CONNECTION;
+   bgworker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   bgworker.bgw_restart_time = 5;
+   bgworker.bgw_notify_pid = 0;
+
+   if (!RegisterDynamicBackgroundWorker(&bgworker, &handle))
+       ereport(ERROR,
+               (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+                errmsg("Registering BDR worker failed, check prior log messages for details")));
+
+   elog(DEBUG2, "Registered per-db worker for %s successfully", dbname);
+}
+
+/*
+ * Check for BDR-enabled DBs and start per-db workers for any that currently
+ * lack them.
+ *
+ * Runs in its own transaction. pg_shseclabel is scanned for labels that the
+ * BDR provider attached to databases; for each such database that
+ * bdr_is_bdr_activated_db() accepts and that has no perdb worker slot yet, a
+ * perdb worker is registered. BdrWorkerCtl->lock is held exclusively for the
+ * whole scan. Labels whose database can no longer be found are skipped.
+ *
+ * TODO DYNCONF: Handle removal of BDR from DBs
+ */
+static void
+bdr_supervisor_rescan_dbs(void)
+{
+   Relation    secrel;
+   ScanKeyData skey[2];
+   SysScanDesc scan;
+   HeapTuple   secTuple;
+   int         n_new_workers = 0, bdr_dbs = 0;
+
+   elog(DEBUG1, "Supervisor scanning for BDR-enabled databases");
+
+   pgstat_report_activity(STATE_RUNNING, "scanning backends");
+
+   StartTransactionCommand();
+
+   /*
+    * Scan pg_shseclabel looking for entries for pg_database with the bdr
+    * label provider. We'll find all labels for the BDR provider,
+    * irrespective of value.
+    *
+    * The only index present isn't much use for this scan and using it makes
+    * us set up more keys, so do a heap scan.
+    *
+    * The lock taken on pg_shseclabel must be strong enough to conflict with
+    * the lock taken by bdr.bdr_connection_add(...) to ensure that any
+    * transactions adding new labels have committed and cleaned up before we
+    * read it. Otherwise a race between the supervisor latch being set in a
+    * commit hook and the tuples actually becoming visible is possible.
+    */
+   secrel = heap_open(SharedSecLabelRelationId, RowShareLock);
+
+   ScanKeyInit(&skey[0],
+               Anum_pg_shseclabel_classoid,
+               BTEqualStrategyNumber, F_OIDEQ,
+               ObjectIdGetDatum(DatabaseRelationId));
+
+   ScanKeyInit(&skey[1],
+               Anum_pg_shseclabel_provider,
+               BTEqualStrategyNumber, F_TEXTEQ,
+               CStringGetTextDatum(BDR_SECLABEL_PROVIDER));
+
+   scan = systable_beginscan(secrel, InvalidOid, false, NULL, 2, &skey[0]);
+
+   /*
+    * We need to scan the shmem segment that tracks BDR workers and possibly
+    * modify it, so lock it.
+    *
+    * We have to take an exclusive lock in case we need to modify it,
+    * otherwise we'd be faced with a lock upgrade.
+    */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+   /*
+    * Now examine each label and if there's no worker for the labeled
+    * DB already, start one.
+    */
+   while (HeapTupleIsValid(secTuple = systable_getnext(scan)))
+   {
+       FormData_pg_shseclabel *sec;
+       char                   *label_dbname;
+
+       sec = (FormData_pg_shseclabel*) GETSTRUCT(secTuple);
+
+       /*
+        * The per-db workers are mapped by name not oid, and that's necessary
+        * because the bgworker API requires that databases be identified by
+        * name.
+        *
+        * Look up the name of the DB with this OID and compare it. It's a bit slow,
+        * but we aren't doing this much.
+        *
+        * FIXME: Currently if a database is renamed, you'll have to restart
+        * PostgreSQL before BDR notices.
+        */
+       label_dbname = get_database_name(sec->objoid);
+
+       /*
+        * get_database_name() returns NULL when no database with this oid
+        * can be found, e.g. if it was dropped while we were scanning. There
+        * is nothing to start a worker for, and the name must not be passed
+        * to pfree() or printed with %s.
+        */
+       if (label_dbname == NULL)
+       {
+           elog(DEBUG1, "Skipping BDR security label for missing database (oid=%u)",
+                sec->objoid);
+           continue;
+       }
+
+       if (!bdr_is_bdr_activated_db(sec->objoid))
+       {
+           pfree(label_dbname);
+           continue;
+       }
+
+       /* Oid is unsigned, so print it with %u */
+       elog(DEBUG1, "Found BDR-enabled database %s (oid=%u)",
+            label_dbname, sec->objoid);
+
+       bdr_dbs++;
+
+       /*
+        * Check if we have a per-db worker for this db oid already and if
+        * we don't, start one.
+        *
+        * This is O(n^2) for n BDR-enabled DBs; to be more scalable we could
+        * accumulate and sort the oids, then do a single scan of the shmem
+        * segment. But really, if you have that many DBs this cost is nothing.
+        */
+       if (find_perdb_worker_slot(sec->objoid, NULL) == -1)
+       {
+           /* No perdb worker exists for this DB, make one */
+           bdr_register_perdb_worker(label_dbname);
+           n_new_workers++;
+       } else {
+           elog(DEBUG2, "per-db worker for db %s already exists, not registering",
+                label_dbname);
+       }
+
+       pfree(label_dbname);
+   }
+
+   elog(DEBUG2, "Found %i BDR-labeled DBs; registered %i new per-db workers",
+        bdr_dbs, n_new_workers);
+
+   LWLockRelease(BdrWorkerCtl->lock);
+
+   systable_endscan(scan);
+   heap_close(secrel, RowShareLock);
+
+   CommitTransactionCommand();
+
+   elog(DEBUG2, "Finished scanning for BDR-enabled databases");
+
+   pgstat_report_activity(STATE_IDLE, NULL);
+}
+
+/*
+ * Create the database the supervisor remains connected to, if it does not
+ * exist yet.
+ *
+ * The database is named 'bdr' and is created from template1 with a
+ * connection limit of 1. The lookup and the creation run in a single
+ * transaction started and committed here.
+ *
+ * This is a workaround for the inability to use pg_shseclabel
+ * without a DB connection; see comments in bdr_supervisor_worker_main.
+ *
+ * Raises ERROR if createdb() does not return a valid oid.
+ */
+static void
+bdr_supervisor_createdb(void)
+{
+   Oid dboid;
+
+   StartTransactionCommand();
+
+   /* If the DB already exists, no need to create it */
+   dboid = get_database_oid("bdr", true);
+
+   if (dboid == InvalidOid)
+   {
+       CreatedbStmt stmt;
+       DefElem de_template;
+       DefElem de_connlimit;
+
+       /*
+        * These nodes live on the stack, so zero them first: fields we do
+        * not set explicitly must not contain garbage.
+        */
+       memset(&stmt, 0, sizeof(stmt));
+       memset(&de_template, 0, sizeof(de_template));
+       memset(&de_connlimit, 0, sizeof(de_connlimit));
+
+       /*
+        * The node tag of a DefElem is T_DefElem; the kind of value it
+        * carries is described by the tag of its arg node.
+        */
+       de_template.type = T_DefElem;
+       de_template.defname = "template";
+       de_template.arg = (Node*) makeString("template1");
+
+       de_connlimit.type = T_DefElem;
+       de_connlimit.defname = "connectionlimit";
+       de_connlimit.arg = (Node*) makeInteger(1);
+
+       stmt.type = T_CreatedbStmt;
+       stmt.dbname = "bdr";
+       stmt.options = list_make2(&de_template, &de_connlimit);
+
+       dboid = createdb(&stmt);
+
+       if (dboid == InvalidOid)
+           elog(ERROR, "Failed to create 'bdr' DB");
+
+       /* TODO DYNCONF: Add a comment to the db, and/or a dummy table */
+
+       /* Oid is unsigned, so print it with %u */
+       elog(LOG, "Created database 'bdr' (oid=%u) during BDR startup", dboid);
+   }
+   else
+   {
+       elog(DEBUG3, "Database 'bdr' (oid=%u) already exists, not creating", dboid);
+   }
+
+   CommitTransactionCommand();
+
+   Assert(dboid != InvalidOid);
+}
+
+
+/*
+ * The BDR supervisor is a static bgworker that serves as the master/supervisor
+ * for all BDR workers. It exists so that BDR can be enabled and disabled
+ * dynamically for databases.
+ *
+ * It is responsible for identifying BDR-enabled databases at startup and
+ * launching their dynamic per-db workers. It should do as little else as
+ * possible, as it'll run when BDR is in shared_preload_libraries whether
+ * or not it's otherwise actually in use.
+ *
+ * On its first run (BdrWorkerCtl->is_supervisor_restart not yet set) it
+ * connects to template1, makes sure the 'bdr' database exists, sets the flag
+ * and exits with status 1, expecting to be started again. On later runs it
+ * connects to the 'bdr' database, publishes its latch in
+ * BdrWorkerCtl->supervisor_latch, scans for BDR-enabled databases once and
+ * then sleeps on its latch, rescanning whenever the latch is set.
+ *
+ * main_arg is unused and is asserted to be 0.
+ */
+void
+bdr_supervisor_worker_main(Datum main_arg)
+{
+   Assert(DatumGetInt32(main_arg) == 0);
+   Assert(IsBackgroundWorker);
+
+   pqsignal(SIGHUP, bdr_sighup);
+   pqsignal(SIGTERM, bdr_sigterm);
+   BackgroundWorkerUnblockSignals();
+
+   /*
+    * Unfortunately we currently can't access shared catalogs like
+    * pg_shseclabel (where we store information about which databases use
+    * bdr) without being connected to a database. Only shared & nailed
+    * catalogs can be accessed before being connected to a database - and
+    * pg_shseclabel is not one of those.
+    *
+    * Instead we have a database "bdr" that's supposed to be empty which we
+    * just use to read pg_shseclabel. Not pretty, but it works.
+    *
+    * Without copying significant parts of InitPostgres() we can't even read
+    * pg_database without connecting to a database.  As we can't connect to
+    * "no database", we must connect to one that always exists, like
+    * template1, then use it to create a dummy database to operate in.
+    *
+    * Once created we set a shmem flag and restart so we know we can connect
+    * to the newly created database. The restart is done by exiting with
+    * status 1 below; the flag is written without taking BdrWorkerCtl->lock.
+    */
+   if (!BdrWorkerCtl->is_supervisor_restart)
+   {
+       BackgroundWorkerInitializeConnection("template1", NULL);
+       bdr_supervisor_createdb();
+
+       BdrWorkerCtl->is_supervisor_restart = true;
+
+       elog(DEBUG1, "BDR supervisor restarting to connect to 'bdr' DB");
+       proc_exit(1);
+   }
+
+   BackgroundWorkerInitializeConnection("bdr", NULL);
+
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+   BdrWorkerCtl->supervisor_latch = &MyProc->procLatch;
+   LWLockRelease(BdrWorkerCtl->lock);
+
+   elog(DEBUG1, "BDR supervisor connected to DB 'bdr'");
+
+   SetConfigOption("application_name", "bdr supervisor", PGC_USERSET, PGC_S_SESSION);
+
+   /* mark as idle, before starting to loop */
+   pgstat_report_activity(STATE_IDLE, NULL);
+
+   bdr_supervisor_rescan_dbs();
+
+   while (!got_SIGTERM)
+   {
+       int rc;
+
+       /*
+        * After the startup scan the supervisor only has work to do when
+        * something sets its latch, so it sleeps here until the latch is
+        * set, the 180 second timeout expires, or the postmaster dies.
+        * It could exit after running startup, but we're expecting to need
+        * it to do other things down the track, so might as well keep it
+        * alive...
+        */
+       rc = WaitLatch(&MyProc->procLatch,
+                      WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                      180000L);
+
+       ResetLatch(&MyProc->procLatch);
+
+       /* emergency bailout if postmaster has died */
+       if (rc & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+
+       if (got_SIGHUP)
+       {
+           got_SIGHUP = false;
+           ProcessConfigFile(PGC_SIGHUP);
+       }
+
+       if (rc & WL_LATCH_SET)
+       {
+           /*
+            * We've been asked to launch new perdb workers if there are any
+            * changes to security labels. A plain timeout wakeup does not
+            * trigger a rescan.
+            */
+           bdr_supervisor_rescan_dbs();
+       }
+   }
+
+   proc_exit(0);
+}
+
+/*
+ * Register the BDR supervisor bgworker, which will start all the
+ * per-db workers.
+ *
+ * Must run in the postmaster itself (asserted below); the original author
+ * notes it is called from _PG_init.
+ *
+ * The worker is registered statically, runs bdr_supervisor_worker_main from
+ * the BDR library once recovery has finished, and has a restart time of 1.
+ * Its main argument is always 0.
+ *
+ * NOTE(review): the original comment states that the supervisor is
+ * guaranteed the first slot in the workers shmem array because shmem isn't
+ * allocated yet at this point; nothing in this function establishes that,
+ * so confirm against the shmem init callback.
+ */
+void
+bdr_supervisor_register()
+{
+   BackgroundWorker supervisor;
+
+   Assert(IsPostmasterEnvironment && !IsUnderPostmaster);
+
+   /* Identity and entry point */
+   snprintf(supervisor.bgw_name, BGW_MAXLEN,
+            "bdr supervisor");
+   supervisor.bgw_main = NULL;
+   strncpy(supervisor.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+   strncpy(supervisor.bgw_function_name, "bdr_supervisor_worker_main", BGW_MAXLEN);
+   supervisor.bgw_main_arg = Int32GetDatum(0); /* unused */
+
+   /*
+    * The supervisor reads shared relations, so it has to be flagged as
+    * using a database connection in the bgworker API in addition to shmem
+    * access.
+    */
+   supervisor.bgw_flags = BGWORKER_SHMEM_ACCESS |
+       BGWORKER_BACKEND_DATABASE_CONNECTION;
+
+   /* Scheduling */
+   supervisor.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   supervisor.bgw_restart_time = 1;
+   supervisor.bgw_notify_pid = 0;
+
+   RegisterBackgroundWorker(&supervisor);
+}
diff --git a/bdr_upgrade.c b/bdr_upgrade.c

new file mode 100644 (file)

index 0000000..cb39688
--- /dev/null
+++ b/bdr_upgrade.c
@@ -0,0 +1,639 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr_upgrade.c
+ *     Support for upgrading between BDR versions
+ *
+ * Copyright (C) 2012-2015, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     bdr_upgrade.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+#include "libpq-fe.h"
+#include "miscadmin.h"
+
+#include "libpq/pqformat.h"
+
+#include "catalog/pg_type.h"
+
+#include "storage/ipc.h"
+
+PGDLLEXPORT Datum bdr_upgrade_to_090(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(bdr_upgrade_to_090);
+
+/*
+ * Insert a row describing the local node into bdr.bdr_connections over the
+ * given libpq connection.
+ *
+ * conn:           open libpq connection the INSERT is sent on
+ * local_sysid:    value for conn_sysid
+ * local_timeline: value for conn_timeline
+ * local_dboid:    value for conn_dboid
+ * my_conninfo:    value for conn_dsn
+ *
+ * The origin columns are all set to "0" and conn_is_unidirectional to false.
+ *
+ * Raises ERROR if the INSERT does not complete with PGRES_COMMAND_OK. The
+ * libpq result is released before the error is raised, since it is not
+ * palloc'd and would otherwise be leaked by the non-local exit.
+ */
+static void
+bdr_upgrade_to_090_insert_connection( PGconn *conn,
+       const char *local_sysid, const char *local_timeline,
+       const char *local_dboid, const char *my_conninfo)
+{
+   PGresult        *res;
+   const char      *values[8];
+   Oid             types[8] =
+       { TEXTOID, OIDOID, OIDOID, TEXTOID, OIDOID, OIDOID, BOOLOID, TEXTOID };
+
+   values[0] = local_sysid;
+   values[1] = local_timeline;
+   values[2] = local_dboid;
+   values[3] = "0";
+   values[4] = "0";
+   values[5] = "0";
+   values[6] = "f";
+   values[7] = my_conninfo;
+   /* TODO: replication sets too! */
+
+   res = PQexecParams(conn, "INSERT INTO bdr.bdr_connections\n"
+                            "(conn_sysid, conn_timeline, conn_dboid,\n"
+                            " conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,\n"
+                            " conn_is_unidirectional, conn_dsn)\n"
+                            "VALUES ($1,$2,$3,$4,$5,$6,$7,$8)",
+                      8, types, values, NULL, NULL, 0);
+
+   if (PQresultStatus(res) != PGRES_COMMAND_OK)
+   {
+       /* PQresStatus returns a constant string, so it survives PQclear */
+       const char *status_name = PQresStatus(PQresultStatus(res));
+       /* The error text belongs to res; copy it before freeing res */
+       char       *error_text = pstrdup(PQresultErrorMessage(res));
+
+       PQclear(res);
+
+       elog(ERROR, "inserting local info into bdr_connections failed with %s: %s\n",
+            status_name, error_text);
+   }
+
+   PQclear(res);
+}
+
+/*
+ * Utility function for upgrading a BDR node running 0.8.0 or older to 0.9.0
+ * (dynamic configuration).
+ *
+ * When remote_conninfo is NULL this upgrades the first node of a BDR group;
+ * otherwise it upgrades the local node by connecting to an already upgraded
+ * peer node, which is the case the checks described below apply to.
+ *
+ * This does some sanity checks to ensure the local node isn't already joined
+ * and that the remote node is actually a known peer with a bdr_nodes entry.
+ *
+ * It then copies the remote end's bdr_connections entries to the local node so
+ * the local node knows which peers to connect to. It inserts a copy of the
+ * local node's bdr_connections entry in the remote and tells the local and
+ * remote nodes to refresh their worker lists.
+ *
+ * This is one long function because it's one-shot code. It's written in C
+ * so it can re-use libpq connections across multiple steps, doing everything
+ * in one transaction.
+ */
+Datum
+bdr_upgrade_to_090(PG_FUNCTION_ARGS)
+{
+   const char  *my_conninfo = PG_GETARG_CSTRING(0);
+   const char  *remote_conninfo;
+   const char  *my_local_conninfo = NULL;
+   PGconn      *local_conn = NULL;
+   const char  *local_dsn;
+
+   char        local_sysid_str[33];
+   char        local_timeline_str[33];
+   char        local_dboid_str[33];
+
+   stringify_my_node_identity(local_sysid_str, sizeof(local_sysid_str),
+                              local_timeline_str, sizeof(local_timeline_str),
+                              local_dboid_str, sizeof(local_dboid_str));
+
+   if (!PG_ARGISNULL(1))
+   {
+       my_local_conninfo = PG_GETARG_CSTRING(1);
+       local_dsn = my_local_conninfo;
+   }
+   else
+   {
+       local_dsn = my_conninfo;
+   }
+
+   if (PG_ARGISNULL(2))
+   {
+       elog(NOTICE, "upgrading the first node of a BDR group (remote_conninfo was null)");
+       remote_conninfo = NULL;
+   }
+   else
+   {
+       elog(NOTICE, "upgrading the local node by connecting to an already upgraded peer node");
+       remote_conninfo = PG_GETARG_CSTRING(2);
+   }
+
+   /*
+    * Connect to the local node in non-replication mode.
+    *
+    * We'll use this connection to COPY pg_connections data, instead of having
+    * to mess around constructing and deconstructing pg_connections tuples. It
+    * also lets us commit autonomously.
+    */
+   local_conn = PQconnectdb(local_dsn);
+
+   if (PQstatus(local_conn) != CONNECTION_OK)
+   {
+       ereport(ERROR,
+               (errmsg("connection to supplied local dsn '%s' failed", local_dsn),
+                errdetail("Connection failed with %s", PQerrorMessage(local_conn))));
+   }
+
+   PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&local_conn));
+   {
+       PGconn *remote_conn = NULL;
+       PGresult *res;
+       remote_node_info    ri, li, li_via_remote;
+       Oid         nodeid_types[3] = { TEXTOID, OIDOID, OIDOID };
+       const char  *local_nodeid_values[3];
+
+       const char * const bdr_nodes_query =
+           "SELECT 1 FROM bdr.bdr_nodes "
+           "WHERE node_sysid = $1 AND node_timeline = $2 AND node_dboid = $3";
+
+       const char * const setup_query =
+           "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;\n"
+           "SET search_path = bdr, pg_catalog;\n"
+           "SET bdr.permit_unsafe_ddl_commands = on;\n"
+           "SET bdr.skip_ddl_replication = on;\n"
+           "SET bdr.skip_ddl_locking = on;\n"
+           "LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;\n"
+           "LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;\n";
+
+       local_nodeid_values[0] = &local_sysid_str[0];
+       local_nodeid_values[1] = &local_timeline_str[0];
+       local_nodeid_values[2] = &local_dboid_str[0];
+
+       res = PQexec(local_conn, setup_query);
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "BEGIN or table locking on local failed: %s",
+                   PQresultErrorMessage(res));
+
+       PQclear(res);
+
+       /*
+        * Check that the local connection supplied is usable, and that the
+        * node identity of the endpoint matches the node we're being called
+        * in.
+        *
+        * This will test the local-only remote_conn if supplied, otherwise the
+        * my-dsn remote_conn. Whichever one we're using for the init process.
+        * (There's no guarantee that my-dsn is even valid from the perspective
+        * of the local node if a local_dsn was also supplied).
+        *
+        * Replication mode isn't tested here. We'll ask the peer to
+        * connect back to us later instead.
+        */
+       bdr_get_remote_nodeinfo_internal(local_conn, &li);
+
+       if (!(li.sysid == GetSystemIdentifier()
+           && li.timeline == ThisTimeLineID
+           && li.dboid == MyDatabaseId))
+       {
+           ereport(ERROR,
+                   (errmsg("local dsn %s must point to the local node", local_dsn),
+                    errdetail("Expected node identity ("UINT64_FORMAT",%u,%u) but got ("UINT64_FORMAT",%u,%u)",
+                        GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId,
+                        li.sysid, li.timeline, li.dboid)));
+       }
+
+       if (!li.is_superuser)
+           elog(ERROR, "local connection '%s' must have superuser rights", local_dsn);
+
+       {
+           /*
+            * Check for ourselves in local bdr_nodes by UPDATEing our local
+            * bdr_nodes entry. This will get propagated to the remote end later.
+            *
+            * These values could already be set if a prior upgrade attempt failed
+            * after a local commit and before the remote commit.
+            */
+           const char *    node_status;
+           const char *    bdr_nodes_update_values[5];
+
+           Oid             bdr_nodes_update_types[5] =
+               { TEXTOID, OIDOID, OIDOID, TEXTOID, TEXTOID };
+
+
+           bdr_nodes_update_values[0] = &local_sysid_str[0];
+           bdr_nodes_update_values[1] = &local_timeline_str[0];
+           bdr_nodes_update_values[2] = &local_dboid_str[0];
+
+           if (local_dsn != NULL)
+               bdr_nodes_update_values[3] = local_dsn;
+           else
+               bdr_nodes_update_values[3] = NULL;
+
+           if (remote_conninfo != NULL)
+               bdr_nodes_update_values[4] = remote_conninfo;
+           else
+               bdr_nodes_update_values[4] = NULL;
+
+           res = PQexecParams(local_conn,
+                              "UPDATE bdr.bdr_nodes "
+                              "SET node_local_dsn = $4, "
+                              "    node_init_from_dsn = $5 "
+                              "WHERE node_sysid = $1 "
+                              "  AND node_timeline = $2 "
+                              "  AND node_dboid = $3"
+                              "RETURNING node_status",
+                              5, bdr_nodes_update_types, bdr_nodes_update_values,
+                              NULL, NULL, 0);
+
+           if (PQresultStatus(res) != PGRES_TUPLES_OK)
+           {
+               elog(ERROR, "updating local bdr_nodes failed: state %s: %s\n",
+                    PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+           }
+
+           if (PQntuples(res) != 1)
+           {
+               ereport(ERROR,
+                       (errmsg("no entry for local node found in bdr.bdr_nodes"),
+                        errdetail("Expected (node_sysid="UINT64_FORMAT",node_timeline=%u,node_dboid=%u) but no such row found in bdr_nodes",
+                            GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId)));
+           }
+
+           node_status = PQgetvalue(res, 0, 0);
+
+           if (strcmp(node_status, "r") != 0)
+           {
+               ereport(ERROR,
+                       (errmsg("bdr_nodes entry for local node has status != 'r'"),
+                        errdetail("Row with (node_sysid="UINT64_FORMAT",node_timeline=%u,node_dboid=%u) but status = '%s' not expected 'r'",
+                            GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId, node_status)));
+           }
+
+       }
+
+       /*
+        * Another sanity check: Local bdr_connections must be empty.
+        *
+        * If it isn't then a prior upgrade failed after the local commit
+        * but before the remote commit. The local bdr_connections must be
+        * deleted with replication disabled to prevent the deletion
+        * from being enqueued on the outbound slots. This is done
+        * manually by the user per the docs.
+        */
+       res = PQexec(local_conn, "SELECT 1 FROM bdr.bdr_connections");
+
+       if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       {
+           elog(ERROR, "querying local bdr_connections failed: state %s: %s\n",
+                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+       }
+
+       if (PQntuples(res) > 0)
+       {
+           ereport(ERROR,
+                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                    errmsg("the local node's bdr.bdr_connections is not empty"),
+                    errdetail("No connections from the local node to other nodes may exist when upgrading"),
+                    errhint("If a prior upgrade attempt failed see the documentation for recovery steps")));
+       }
+
+       PQclear(res);
+
+       /*
+        * BDR requires a security label to be set on the database in order
+        * to start up.
+        */
+       res = PQexec(local_conn, "SELECT bdr.internal_update_seclabel()");
+
+       if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       {
+           elog(ERROR, "setting local bdr security label failed: state %s: %s\n",
+                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+       }
+
+       PQclear(res);
+
+
+       /*
+        * If this is the first node, insert an entry for ourselves into
+        * the local bdr_connections. We can't insert into the remote and
+        * have it replicate because there is no remote.
+        */
+       if (remote_conninfo == NULL)
+       {
+           bdr_upgrade_to_090_insert_connection(local_conn, local_sysid_str,
+                   local_timeline_str, local_dboid_str, my_conninfo);
+       }
+
+       /*
+        * Establish the connection we'll use to copy the bdr_connections
+        * entries we need and insert our own bdr_connections entry
+        * into the remote end.
+        */
+       if (remote_conninfo != NULL)
+       {
+           StringInfoData  dsn;
+
+           initStringInfo(&dsn);
+           appendStringInfo(&dsn,
+                           "%s fallback_application_name='"BDR_LOCALID_FORMAT":init'",
+                           remote_conninfo, BDR_LOCALID_FORMAT_ARGS);
+           /*
+            * Test to see if there's an entry in the remote's bdr.bdr_nodes for our
+            * system identifier. If there is, that'll tell us what stage of startup
+            * we are up to and let us resume an incomplete start.
+            */
+           remote_conn = PQconnectdb(dsn.data);
+           if (PQstatus(remote_conn) != CONNECTION_OK)
+           {
+               ereport(FATAL,
+                       (errmsg("could not connect to the server in non-replication mode: %s",
+                               PQerrorMessage(remote_conn)),
+                        errdetail("dsn was: %s", dsn.data)));
+           }
+       }
+
+       PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                               PointerGetDatum(&remote_conn));
+       {
+
+           char        remote_sysid_str[33];
+           char        remote_timeline_str[33];
+           char        remote_dboid_str[33];
+           const char *remote_nodeid_values[3];
+
+           if (remote_conn != NULL)
+           {
+               res = PQexec(remote_conn, setup_query);
+               if (PQresultStatus(res) != PGRES_COMMAND_OK)
+                   elog(ERROR, "BEGIN or table locking on remote failed: %s",
+                           PQresultErrorMessage(res));
+
+               PQclear(res);
+
+               /*
+                * Obtain the remote node's identity so we can look it up in the local
+                * bdr_nodes and see if we recognise this node. This will also ensure
+                * BDR is installed on the remote.
+                */
+               bdr_get_remote_nodeinfo_internal(remote_conn, &ri);
+
+               if (ri.sysid == GetSystemIdentifier()
+                   && ri.timeline == ThisTimeLineID
+                   && ri.dboid == MyDatabaseId)
+               {
+                   bdr_error_nodeids_must_differ(ri.sysid, ri.timeline, ri.dboid);
+               }
+
+               if (ri.version_num != BDR_VERSION_NUM)
+                   elog(ERROR, "remote end must run BDR version %s but is running %s",
+                        BDR_VERSION, ri.version);
+
+               if (!ri.is_superuser)
+                   elog(ERROR, "connection must have superuser rights");
+
+               if (strcmp(ri.variant, "BDR") != 0)
+                   elog(ERROR, "remote node must be running full BDR, not variant %s",
+                           ri.variant);
+
+               /*
+                * As a further sanity check, make sure the remote node can connect back
+                * to the local node, and that the resulting IDs match.
+                */
+               bdr_test_remote_connectback_internal(remote_conn, &li_via_remote, my_conninfo);
+
+               if (!(li_via_remote.sysid == GetSystemIdentifier()
+                   && li_via_remote.timeline == ThisTimeLineID
+                   && li_via_remote.dboid == MyDatabaseId))
+               {
+                   ereport(ERROR,
+                           (errmsg("remote node can connect to dsn %s but it doesn't match the local node identity", my_conninfo),
+                            errdetail("Expected node identity ("UINT64_FORMAT",%u,%u) but got ("UINT64_FORMAT",%u,%u)",
+                                GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId,
+                                li_via_remote.sysid, li_via_remote.timeline, li_via_remote.dboid)));
+               }
+
+               if (!li_via_remote.is_superuser)
+                   elog(ERROR, "connection from remote node to local node using dsn '%s' must have superuser rights", my_conninfo);
+
+               /*
+                * The basics look sane. Check to see if the target node is present
+                * in the local bdr_nodes. If it isn't then we can't join it with
+                * an upgrade, because it's not an existing peer.
+                */
+
+               stringify_node_identity(remote_sysid_str, sizeof(remote_sysid_str),
+                                       remote_timeline_str, sizeof(remote_timeline_str),
+                                       remote_dboid_str, sizeof(remote_dboid_str),
+                                       ri.sysid, ri.timeline, ri.dboid);
+
+               remote_nodeid_values[0] = &remote_sysid_str[0];
+               remote_nodeid_values[1] = &remote_timeline_str[0];
+               remote_nodeid_values[2] = &remote_dboid_str[0];
+
+               res = PQexecParams(local_conn, bdr_nodes_query, 3, nodeid_types, remote_nodeid_values, NULL, NULL, 0);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying local bdr_nodes for remote nodeid failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) == 0)
+               {
+                   /* Looks like we didn't find the expected node entry */
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The remote node identified by the passed remote connection string is not known locally"),
+                            errdetail("The remote node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the correponding (node_sysid,node_timeline,node_dboid) is present in the local bdr.bdr_nodes",
+                                ri.sysid, ri.timeline, ri.dboid),
+                            errhint("You can only upgrade a node by connecting to a node it was already joined to before the BDR version update")));
+               }
+
+               Assert(PQntuples(res) == 1);
+
+               PQclear(res);
+
+               /*
+                * Now ensure that our node is known to the remote end
+                */
+               res = PQexecParams(remote_conn, bdr_nodes_query, 3, nodeid_types,
+                                  local_nodeid_values, NULL, NULL, 0);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying remote bdr_nodes for local nodeid failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) == 0)
+               {
+                   /*
+                    * We're not known to the remote node so we can't do an upgrade
+                    * join to it.
+                    */
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The node identified by the passed connection string does not recognise the local node"),
+                            errdetail("The local node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the correponding (node_sysid,node_timeline,node_dboid) is present in the remote bdr.bdr_nodes",
+                                GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId),
+                            errhint("You can only upgrade a node by connecting to a node it was already joined to before the BDR version update")));
+               }
+
+               Assert(PQntuples(res) == 1);
+
+               PQclear(res);
+
+               /*
+                * We now know there's a bdr_nodes entry on each end. Ensure that the
+                * remote end contains at least a bdr_connections entry for itself
+                * and does NOT contain a connection for us.
+                */
+               res = PQexec(remote_conn,
+                            "SELECT 1 "
+                            "FROM bdr.bdr_connections c, "
+                            "     bdr.bdr_get_local_nodeid() l "
+                            "WHERE c.conn_sysid = l.sysid "
+                            "  AND c.conn_timeline = l.timeline "
+                            "  AND c.conn_dboid = l.dboid "
+                            );
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying remote bdr_connections failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) != 1)
+               {
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The node identified by the passed connection string does not yet have a connection entry for its own node"),
+                            errdetail("The remote node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the correponding (conn_sysid,conn_timeline,conn_dboid) is present in the local bdr.bdr_connections",
+                                ri.sysid, ri.timeline, ri.dboid),
+                            errhint("You must have already upgraded the other node before you can use it to upgrade this node.")));
+               }
+
+               PQclear(res);
+
+               res = PQexecParams(remote_conn,
+                                  "SELECT 1 "
+                                  "FROM bdr.bdr_connections c "
+                                  "WHERE c.conn_sysid = $1 "
+                                  "  AND c.conn_timeline = $2 "
+                                  "  AND c.conn_dboid = $3 ",
+                                  3, nodeid_types, local_nodeid_values, NULL, NULL, 0);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying remote bdr_connections failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) != 0)
+               {
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The node identified by the passed connection string already has a connection string for the local node"),
+                            errdetail("The local node's identity (conn_sysid="UINT64_FORMAT",conn_timeline=%u,conn_dboid=%u) already has an entry in the remote bdr.bdr_connections",
+                                li.sysid, li.timeline, ri.dboid),
+                            errhint("You must have already upgraded the other node before you can use it to upgrade this node.")));
+               }
+
+               PQclear(res);
+
+               /*
+                * Alright, time to actually perform the upgrade.
+                *
+                * We need to:
+                *
+                * - Copy remote bdr_connections entries to the local node
+                *
+                * - Upsert a row for the local node in the remote's
+                *   bdr_connections
+                *
+                * - Register an on commit hook on the remote to rescan
+                *   bdr_connections.
+                *
+                * - Register an on commit hook on the local side to rescan
+                *   bdr_connections
+                *
+                * - set the local security label
+                *
+                * - Commit the remote transaction, adding the bdr_connections
+                *   row
+                *
+                * - Return, allowing a commit to occur to save the local
+                *   bdr_connections entries.
+                */
+
+               bdr_copytable(remote_conn, local_conn,
+                       "COPY (SELECT * FROM bdr.bdr_connections) TO stdout",
+                       "COPY bdr.bdr_connections FROM stdin");
+
+               /*
+                * Time to insert connection info about us into the remote node and ask it
+                * to connect back to us, then tell the other nodes. We don't update
+                * the remote's bdr_nodes entry for us, as the change we applied locally
+                * will get replicated.
+                *
+                * Since we have a remote conn we didn't insert our
+                * bdr_connections entry locally above. Insert it into the
+                * remote node now instead. It'll replicate back to the local
+                * node when we connect to the upstream.
+                */
+               bdr_upgrade_to_090_insert_connection(remote_conn, local_sysid_str,
+                       local_timeline_str, local_dboid_str, my_conninfo);
+
+               res = PQexec(remote_conn, "SELECT bdr.bdr_connections_changed()");
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+                   elog(ERROR, "SELECT bdr.bdr_connections_changed() on remote failed: %s",
+                           PQresultErrorMessage(res));
+
+               PQclear(res);
+
+               res = PQexec(remote_conn, "INSERT INTO bdr.bdr_queued_commands\n"
+                                         "(lsn, queued_at, perpetrator, command_tag, command)\n"
+                                         "VALUES (pg_current_xlog_insert_location(), current_timestamp,\n"
+                                         "        current_user, 'SELECT',\n"
+                                         "       'SELECT bdr.bdr_connections_changed()');");
+
+               if (PQresultStatus(res) != PGRES_COMMAND_OK)
+                   elog(ERROR, "enqueuing bdr.bdr_connections_changed() in the ddl rep queue failed: %s",
+                           PQresultErrorMessage(res));
+           }
+
+           res = PQexec(local_conn, "SELECT bdr.bdr_connections_changed()");
+           if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               elog(ERROR, "SELECT bdr.bdr_connections_changed() on local failed: %s",
+                       PQresultErrorMessage(res));
+
+           PQclear(res);
+
+           res = PQexec(local_conn, "COMMIT");
+           if (PQresultStatus(res) != PGRES_COMMAND_OK)
+               elog(ERROR, "COMMIT on remote failed: %s",
+                       PQresultErrorMessage(res));
+
+           PQclear(res);
+
+           if (remote_conn != NULL)
+           {
+               res = PQexec(remote_conn, "COMMIT");
+               if (PQresultStatus(res) != PGRES_COMMAND_OK)
+                   elog(ERROR, "COMMIT on remote failed: %s",
+                           PQresultErrorMessage(res));
+
+               PQclear(res);
+
+               free_remote_node_info(&ri);
+           }
+
+           free_remote_node_info(&li);
+       }
+       PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                               PointerGetDatum(&remote_conn));
+
+       PQfinish(remote_conn);
+
+   }
+   PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&local_conn));
+
+   PQfinish(local_conn);
+
+   PG_RETURN_VOID();
+}
diff --git a/expected/ddl/create.out b/expected/ddl/create.out

index b574fbd9ada520fc51e39b4baeaf14e6543089f5..6aa5d912c6ce3891d72be63bbff0477361701bc0 100644 (file)
--- a/expected/ddl/create.out
+++ b/expected/ddl/create.out
@@ -774,6 +774,13 @@ ERROR:  Tables WITH OIDs are not supported with bdr
  CREATE TABLE tbl_without_oids() WITHOUT oids;
  DROP TABLE tbl_without_oids;
  SET default_with_oids = false;
+SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), pid) FROM pg_stat_replication;
+ pg_xlog_wait_remote_apply 
+---------------------------
+ 
+ 
+(2 rows)
+
  --- AGGREGATE ---
  \c postgres
  CREATE AGGREGATE test_avg (
diff --git a/expected/init.out b/expected/init.out

index 4dd05289da9cfbca97ca7c4cd8d194b73bef096e..0e0cd804c62e3b8249c0ccd6e6d55d095a7d975a 100644 (file)
--- a/expected/init.out
+++ b/expected/init.out
@@ -9,45 +9,9 @@ CREATE USER super SUPERUSER;
  GRANT ALL ON SCHEMA public TO nonsuper;
  \c regression
  GRANT ALL ON SCHEMA public TO nonsuper;
-SELECT pg_sleep(10);
- pg_sleep 
-----------
- 
-(1 row)
-
--- emulate the pg_xlog_wait_remote_apply on vanilla postgres
-DO $DO$BEGIN
-   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
-   IF FOUND THEN
-       RETURN;
-   END IF;
-
-   PERFORM bdr.bdr_replicate_ddl_command($DDL$
-       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
-       AS $FUNC$
-       BEGIN
-           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
-               PERFORM pg_sleep(0.01);
-           END LOOP;
-       END;$FUNC$ LANGUAGE plpgsql;
-   $DDL$);
-END;$DO$;
-SELECT bdr.bdr_replicate_ddl_command($DDL$
-CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
-    OUT readdb1 text,
-    OUT readdb2 text,
-    OUT writedb1 text,
-    OUT writedb2 text
-    ) RETURNS record LANGUAGE SQL AS $f$
-SELECT
-    current_setting('bdrtest.readdb1'),
-    current_setting('bdrtest.readdb2'),
-    current_setting('bdrtest.writedb1'),
-    current_setting('bdrtest.writedb2')
-$f$;
-$DDL$);
- bdr_replicate_ddl_command 
----------------------------
- 
-(1 row)
-
+\c postgres
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
+\c regression
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
diff --git a/expected/init_bdr.out b/expected/init_bdr.out

new file mode 100644 (file)

index 0000000..9b1dbe4
--- /dev/null
+++ b/expected/init_bdr.out
@@ -0,0 +1,97 @@
+\c postgres
+SELECT bdr.bdr_group_create(
+   dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-1']
+   );
+ bdr_group_create 
+------------------
+ 
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready 
+------------------------------
+ 
+(1 row)
+
+\c regression
+SELECT bdr.bdr_group_join(
+   dsn := 'dbname=regression',
+   init_from_dsn := 'dbname=postgres',
+   local_dsn := 'dbname=regression',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+ bdr_group_join 
+----------------
+ 
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready 
+------------------------------
+ 
+(1 row)
+
+-- Make sure we see two slots and two active connections
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+ plugin | slot_type |  database  | active 
+--------+-----------+------------+--------
+ bdr    | logical   | postgres   | t
+ bdr    | logical   | regression | t
+(2 rows)
+
+SELECT count(*) FROM pg_stat_replication;
+ count 
+-------
+     2
+(1 row)
+
+\c postgres
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+     conn_dsn      |                                conn_replication_sets                                 
+-------------------+--------------------------------------------------------------------------------------
+ dbname=postgres   | {default,important,for-node-1}
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(2 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+ node_status |  node_local_dsn   | node_init_from_dsn 
+-------------+-------------------+--------------------
+ r           | dbname=postgres   | 
+ r           | dbname=regression | dbname=postgres
+(2 rows)
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+     conn_dsn      |                                conn_replication_sets                                 
+-------------------+--------------------------------------------------------------------------------------
+ dbname=postgres   | {default,important,for-node-1}
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(2 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+ node_status |  node_local_dsn   | node_init_from_dsn 
+-------------+-------------------+--------------------
+ r           | dbname=postgres   | 
+ r           | dbname=regression | dbname=postgres
+(2 rows)
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
+ bdr_replicate_ddl_command 
+---------------------------
+ 
+(1 row)
+
diff --git a/expected/init_udr.out b/expected/init_udr.out

new file mode 100644 (file)

index 0000000..6740cb2
--- /dev/null
+++ b/expected/init_udr.out
@@ -0,0 +1,89 @@
+\c postgres
+SELECT bdr.bdr_subscribe(
+   remote_dsn := 'dbname=regression',
+   local_dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+ bdr_subscribe 
+---------------
+ 
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready 
+------------------------------
+ 
+(1 row)
+
+-- Make sure we see the slot and active connection
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+ plugin | slot_type |  database  | active 
+--------+-----------+------------+--------
+ bdr    | logical   | regression | t
+(1 row)
+
+SELECT count(*) FROM pg_stat_replication;
+ count 
+-------
+     1
+(1 row)
+
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+     conn_dsn      |                                conn_replication_sets                                 
+-------------------+--------------------------------------------------------------------------------------
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(1 row)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+ node_status | node_local_dsn  | node_init_from_dsn 
+-------------+-----------------+--------------------
+ r           | dbname=postgres | dbname=regression
+(1 row)
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+ conn_dsn | conn_replication_sets 
+----------+-----------------------
+(0 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+ node_status | node_local_dsn | node_init_from_dsn 
+-------------+----------------+--------------------
+(0 rows)
+
+-- emulate the pg_xlog_wait_remote_apply on vanilla postgres
+DO $DO$BEGIN
+   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
+   IF FOUND THEN
+       RETURN;
+   END IF;
+
+   PERFORM bdr.bdr_replicate_ddl_command($DDL$
+       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
+       AS $FUNC$
+       BEGIN
+           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
+               PERFORM pg_sleep(0.01);
+           END LOOP;
+       END;$FUNC$ LANGUAGE plpgsql;
+   $DDL$);
+END;$DO$;
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
+ bdr_replicate_ddl_command 
+---------------------------
+ 
+(1 row)
+
diff --git a/expected/isolation/init.out b/expected/isolation/init.out

new file mode 100644 (file)

index 0000000..aa62f39
--- /dev/null
+++ b/expected/isolation/init.out
@@ -0,0 +1,137 @@
+Parsed test spec with 3 sessions
+
+starting permutation: setup1 setup2 setup3 join_root join_2 wait_join_2 check_join_2 join_3 wait_join_3 check_join_3 wait
+step setup1: 
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+
+step setup2: 
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+
+step setup3: 
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+
+step join_root: 
+   SELECT bdr.bdr_group_create(
+       dsn := 'dbname=node1'
+       );
+
+bdr_group_create
+
+               
+step join_2: 
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node2',
+       init_from_dsn := 'dbname=node1'
+       );
+
+bdr_group_join 
+
+               
+step wait_join_2: 
+   SELECT bdr.bdr_node_join_wait_for_ready();
+
+bdr_node_join_wait_for_ready
+
+               
+step check_join_2: 
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+pg_stat_clear_snapshot
+
+               
+plugin         slot_type      database       active         
+
+bdr            logical        node1          t              
+bdr            logical        node2          t              
+count          
+
+2              
+conn_dsn       conn_replication_sets
+
+dbname=node1   {default}      
+dbname=node2   {default}      
+node_status    node_local_dsn node_init_from_dsn
+
+r              dbname=node1                  
+r              dbname=node2   dbname=node1   
+step join_3: 
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node3',
+       init_from_dsn := 'dbname=node1',
+       local_dsn := 'dbname=node3'
+       );
+
+bdr_group_join 
+
+               
+step wait_join_3: 
+   SELECT bdr.bdr_node_join_wait_for_ready();
+
+bdr_node_join_wait_for_ready
+
+               
+step check_join_3: 
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+pg_stat_clear_snapshot
+
+               
+plugin         slot_type      database       active         
+
+bdr            logical        node1          t              
+bdr            logical        node1          t              
+bdr            logical        node2          t              
+bdr            logical        node2          t              
+bdr            logical        node3          t              
+bdr            logical        node3          t              
+count          
+
+6              
+conn_dsn       conn_replication_sets
+
+dbname=node1   {default}      
+dbname=node2   {default}      
+dbname=node3   {default}      
+node_status    node_local_dsn node_init_from_dsn
+
+r              dbname=node1                  
+r              dbname=node2   dbname=node1   
+r              dbname=node3   dbname=node1   
+step wait: 
+   -- pg_xlog_wait_remote_apply isn't good enough alone
+   -- as it doesn't permit us to say how many nodes must be present.
+   -- It'll succeed if there are zero nodes. So we first have to wait
+   -- for enough replication connections.
+   DO $$
+   DECLARE
+       nodecount integer := 0;
+       target_lsn pg_lsn;
+   BEGIN
+       WHILE nodecount <> 6
+       LOOP
+           PERFORM pg_sleep(1);
+           PERFORM pg_stat_clear_snapshot();
+           -- Now find out how many walsenders are running
+           nodecount := (SELECT count(*)
+                         FROM pg_catalog.pg_stat_replication);
+           RAISE NOTICE 'Found % nodes',nodecount;
+       END LOOP;
+       -- OK, all nodes seen, now we wait for catchup on them all.
+       target_lsn := pg_current_xlog_location();
+       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
+       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
+       RAISE NOTICE 'Catchup to LSN completed';
+   END;
+   $$;
+
diff --git a/expected/isolation/waitforstart.out b/expected/isolation/waitforstart.out

deleted file mode 100644 (file)

index 8fcfe02..0000000
--- a/expected/isolation/waitforstart.out
+++ /dev/null
@@ -1,26 +0,0 @@
-Parsed test spec with 1 sessions
-
-starting permutation: wait
-step wait: 
-   DO $$
-   DECLARE
-       nodecount integer := 0;
-       target_lsn pg_lsn;
-   BEGIN
-       WHILE nodecount <> 6
-       LOOP
-           PERFORM pg_sleep(1);
-           PERFORM pg_stat_clear_snapshot();
-           -- Now find out how many walsenders are running
-           nodecount := (SELECT count(*)
-                         FROM pg_catalog.pg_stat_replication);
-           RAISE NOTICE 'Found % nodes',nodecount;
-       END LOOP;
-       -- OK, all nodes seen, now we wait for catchup on them all.
-       target_lsn := pg_current_xlog_location();
-       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
-       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
-       RAISE NOTICE 'Catchup to LSN completed';
-   END;
-   $$;
-
diff --git a/expected/upgrade.out b/expected/upgrade.out

index 1478090345dc7910010daf37da5723ba67c46ee4..7f1fb1a35fab4d3d33a606ee5dc0f4ac96950da6 100644 (file)
--- a/expected/upgrade.out
+++ b/expected/upgrade.out
@@ -25,6 +25,8 @@ CREATE EXTENSION bdr VERSION '0.8.0.7';
  DROP EXTENSION bdr;
  CREATE EXTENSION bdr VERSION '0.9.0.0';
  DROP EXTENSION bdr;
+CREATE EXTENSION bdr VERSION '0.9.0.1';
+DROP EXTENSION bdr;
  -- evolve version one by one from the oldest to the newest one
  CREATE EXTENSION bdr VERSION '0.8.0';
  ALTER EXTENSION bdr UPDATE TO '0.8.0.1';
@@ -35,8 +37,9 @@ ALTER EXTENSION bdr UPDATE TO '0.8.0.5';
  ALTER EXTENSION bdr UPDATE TO '0.8.0.6';
  ALTER EXTENSION bdr UPDATE TO '0.8.0.7';
  ALTER EXTENSION bdr UPDATE TO '0.9.0.0';
+ALTER EXTENSION bdr UPDATE TO '0.9.0.1';
  -- Should never have to do anything: You missed adding the new version above.
  ALTER EXTENSION bdr UPDATE;
-NOTICE:  version "0.9.0.0" of extension "bdr" is already installed
+NOTICE:  version "0.9.0.1" of extension "bdr" is already installed
  \c postgres
  DROP DATABASE extension_upgrade;
diff --git a/extsql/bdr--0.8.0.sql b/extsql/bdr--0.8.0.sql

index 0560a8ed16ccef588486bfb99903f3dc176ff9f1..6880da348bcc818d463bc3e015a62a66d2e02606 100644 (file)
--- a/extsql/bdr--0.8.0.sql
+++ b/extsql/bdr--0.8.0.sql
@@ -359,6 +359,9 @@ COMMENT ON COLUMN bdr_nodes.node_timeline IS 'timeline ID of this node';
  COMMENT ON COLUMN bdr_nodes.node_dboid IS 'local database oid on the cluster (node_sysid, node_timeline)';
  COMMENT ON COLUMN bdr_nodes.node_status IS 'Readiness of the node: [i]nitializing, [c]atchup, [r]eady. Doesn''t indicate connected/disconnected.';
  
+-- We don't exclude bdr_nodes with pg_extension_config_dump
+-- because this is a global table that's sync'd between nodes.
+
  CREATE TABLE bdr_global_locks(
      locktype text NOT NULL,
  
diff --git a/extsql/bdr--0.9.0.0--0.9.0.1.sql b/extsql/bdr--0.9.0.0--0.9.0.1.sql

new file mode 100644 (file)

index 0000000..98d9ef5
--- /dev/null
+++ b/extsql/bdr--0.9.0.0--0.9.0.1.sql
@@ -0,0 +1,584 @@
+-- Data structures for BDR's dynamic configuration management
+
+SET LOCAL search_path = bdr;
+SET bdr.permit_unsafe_ddl_commands = true;
+SET bdr.skip_ddl_replication = true;
+
+ALTER TABLE bdr.bdr_nodes
+  ADD COLUMN node_local_dsn text,
+  ADD COLUMN node_init_from_dsn text;
+
+ALTER TABLE bdr.bdr_nodes
+  DROP CONSTRAINT bdr_nodes_node_status_check;
+
+ALTER TABLE bdr.bdr_nodes
+  ADD CONSTRAINT bdr_nodes_node_status_check
+    CHECK (node_status in ('b', 'i', 'c', 'o', 'r'));
+
+CREATE TABLE bdr_connections (
+    conn_sysid text not null,
+    conn_timeline oid not null,
+    conn_dboid oid not null,  -- This is an oid local to the conn_sysid cluster
+
+    -- Wondering why there's no FOREIGN KEY to bdr.bdr_nodes?
+    -- bdr.bdr_nodes won't be populated when the bdr.bdr_connections
+    -- row gets created on the local node.
+
+    -- These fields may later be used by BDR to override connection
+    -- settings from one node to a particular other node. At the
+    -- moment their main use is for UDR connections, where we must
+    -- ensure that the connection is only made from one particular
+    -- node.
+    conn_origin_sysid text,
+    conn_origin_timeline oid,
+    conn_origin_dboid oid,
+
+    PRIMARY KEY(conn_sysid, conn_timeline, conn_dboid,
+                conn_origin_sysid, conn_origin_timeline, conn_origin_dboid),
+
+    -- Either a whole origin ID (for an override or UDR entry) or no origin
+    -- ID may be provided; "no origin" is stored as ('0', 0, 0), not NULL.
+    CONSTRAINT origin_all_or_none_null
+        CHECK ((conn_origin_sysid = '0') = (conn_origin_timeline = 0)
+           AND (conn_origin_sysid = '0') = (conn_origin_dboid = 0)),
+
+    -- Indicates that this connection is unidirectional; there won't be
+    -- a corresponding inbound connection from the peer node. Only permitted
+    -- where the conn_origin fields are set.
+    conn_is_unidirectional boolean not null default false,
+
+    CONSTRAINT unidirectional_conn_must_have_origin
+        CHECK ((NOT conn_is_unidirectional) OR (conn_origin_sysid <> '0')),
+
+    conn_dsn text not null,
+
+    conn_apply_delay integer
+        CHECK (conn_apply_delay >= 0),
+
+    conn_replication_sets text[]
+);
+
+REVOKE ALL ON TABLE bdr_connections FROM public;
+
+COMMENT ON TABLE bdr_connections IS 'Connection information for nodes in the group. Don''t modify this directly, use the provided functions. One entry should exist per node in the group.';
+
+COMMENT ON COLUMN bdr_connections.conn_sysid IS 'System identifer for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_timeline IS 'System timeline ID for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_dboid IS 'System database OID for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_origin_sysid IS 'If set, ignore this entry unless the local sysid is this';
+COMMENT ON COLUMN bdr_connections.conn_origin_timeline IS 'If set, ignore this entry unless the local timeline is this';
+COMMENT ON COLUMN bdr_connections.conn_origin_dboid IS 'If set, ignore this entry unless the local dboid is this';
+COMMENT ON COLUMN bdr_connections.conn_dsn IS 'A libpq-style connection string specifying how to make a connection to this node from other nodes.';
+COMMENT ON COLUMN bdr_connections.conn_apply_delay IS 'If set, milliseconds to wait before applying each transaction from the remote node. Mainly for debugging. If null, the global default applies.';
+COMMENT ON COLUMN bdr_connections.conn_replication_sets IS 'Replication sets this connection should participate in, if non-default.';
+
+SELECT pg_catalog.pg_extension_config_dump('bdr_connections', '');
+
+CREATE FUNCTION bdr_connections_changed()
+RETURNS void LANGUAGE c AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION bdr_connections_changed() FROM public;
+
+COMMENT ON FUNCTION bdr_connections_changed() IS 'Internal BDR function, do not call directly.';
+
+
+--
+-- Internal helper for node join, run on the remote (join target) end by the
+-- joining node's init code. Checks that the joiner's bdr_nodes row has status
+-- 'i', upserts its bdr_connections row and signals bdr_connections_changed().
+--
+CREATE FUNCTION bdr.internal_node_join(
+    sysid text, timeline oid, dboid oid,
+    dsn text,
+    apply_delay integer,
+    replication_sets text[]
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+AS
+$body$
+DECLARE
+    status "char";
+BEGIN
+    LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;
+    LOCK TABLE pg_catalog.pg_shseclabel IN EXCLUSIVE MODE;
+
+    IF bdr_variant() <> 'BDR' THEN
+        RAISE USING
+            MESSAGE = 'Full BDR required but this module is built for '||bdr_variant(),
+            DETAIL = 'The target node is running something other than full BDR so you cannot join a BDR node to it',
+            HINT = 'Install full BDR if possible or use the UDR functions.',
+            ERRCODE = 'feature_not_supported';
+    END IF;
+
+    -- Assert that we have a bdr_nodes entry with state = i on this node
+    SELECT node_status INTO status  -- must select the column or status stays NULL
+    FROM bdr.bdr_nodes
+    WHERE node_sysid = sysid
+      AND node_timeline = timeline
+      AND node_dboid = dboid;
+
+    IF NOT FOUND THEN
+        RAISE object_not_in_prerequisite_state
+              USING MESSAGE = format('bdr.bdr_nodes entry for (%s,%s,%s) not found',
+                                     sysid, timeline, dboid);
+    END IF;
+
+    IF status <> 'i' THEN
+        RAISE object_not_in_prerequisite_state
+              USING MESSAGE = format('bdr.bdr_nodes entry for (%s,%s,%s) has unexpected status %L (expected ''i'')',
+                                     sysid, timeline, dboid, status);
+    END IF;
+
+    -- Insert or Update the connection info on this node, which we must be
+    -- initing from. An apply_delay of -1 is stored as NULL (use the default).
+    -- No need to care about concurrency here as we hold EXCLUSIVE LOCK.
+    BEGIN
+        INSERT INTO bdr.bdr_connections
+        (conn_sysid, conn_timeline, conn_dboid,
+         conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+         conn_dsn,
+         conn_apply_delay, conn_replication_sets,
+         conn_is_unidirectional)
+        VALUES
+        (sysid, timeline, dboid,
+         '0', 0, 0,
+         dsn,
+         CASE WHEN apply_delay = -1 THEN NULL ELSE apply_delay END,
+         replication_sets, false);
+    EXCEPTION WHEN unique_violation THEN
+        UPDATE bdr.bdr_connections
+        SET conn_dsn = dsn,
+            conn_apply_delay = CASE WHEN apply_delay = -1 THEN NULL ELSE apply_delay END,
+            conn_replication_sets = replication_sets,
+            conn_is_unidirectional = false
+        WHERE conn_sysid = sysid
+          AND conn_timeline = timeline
+          AND conn_dboid = dboid
+          AND conn_origin_sysid = '0'
+          AND conn_origin_timeline = 0
+          AND conn_origin_dboid = 0;
+    END;
+
+    -- Schedule the apply worker launch for commit time
+    PERFORM bdr.bdr_connections_changed();
+
+    -- and ensure the apply worker is launched on other nodes
+    -- when this transaction replicates there, too.
+    INSERT INTO bdr.bdr_queued_commands
+    (lsn, queued_at, perpetrator, command_tag, command)
+    VALUES
+    (pg_current_xlog_insert_location(), current_timestamp, current_user,
+    'SELECT', 'SELECT bdr.bdr_connections_changed()');
+END;
+$body$;
+
+
+CREATE FUNCTION bdr.internal_update_seclabel()
+RETURNS void LANGUAGE plpgsql
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    v_label json;
+BEGIN
+    -- Update 'bdr' parameter in the current label if there's one.
+    -- (Right now there's not much point to this but later we'll
+    -- possibly have more information in there.)
+
+    -- first select existing label
+    SELECT label::json INTO v_label
+    FROM pg_catalog.pg_shseclabel
+    WHERE provider = 'bdr'
+      AND classoid = 'pg_database'::regclass
+      AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+    -- then drop any existing 'bdr' key and re-add it as 'bdr': true
+    SELECT json_object_agg(key, value) INTO v_label
+    FROM (
+        SELECT key, value
+        FROM json_each(v_label)
+        WHERE key <> 'bdr'
+      UNION ALL
+        SELECT 'bdr', to_json(true)
+    ) d;
+
+    -- and set the newly computed label
+    -- (It's safe to do this early, it won't take effect
+    -- until commit)
+    EXECUTE format('SECURITY LABEL FOR bdr ON DATABASE %I IS %L',
+                   current_database(), v_label);
+END;
+$body$;
+
+-- Setup that's common to BDR and UDR joins; returns the remote node's identity (NULLs if remote_dsn is NULL)
+CREATE FUNCTION bdr.internal_begin_join(caller text, local_dsn text, remote_dsn text,
+    remote_sysid OUT text, remote_timeline OUT oid, remote_dboid OUT oid
+)
+RETURNS record LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    localid RECORD;
+    localid_from_dsn RECORD;
+    remote_nodeinfo RECORD;
+BEGIN
+    -- Only one tx can be adding connections
+    LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;
+    LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;
+    LOCK TABLE pg_catalog.pg_shseclabel IN EXCLUSIVE MODE;
+
+    SELECT sysid, timeline, dboid INTO localid
+    FROM bdr.bdr_get_local_nodeid();
+
+    -- If there's already an entry for ourselves in bdr.bdr_connections
+    -- then we know this node is part of an active BDR group and cannot
+    -- be joined to another group. Unidirectional connections are ignored.
+    PERFORM 1 FROM bdr_connections
+    WHERE conn_sysid = localid.sysid
+      AND conn_timeline = localid.timeline
+      AND conn_dboid = localid.dboid
+      AND (conn_origin_sysid = '0'
+           AND conn_origin_timeline = 0
+           AND conn_origin_dboid = 0)
+      AND conn_is_unidirectional = 'f';
+
+    IF FOUND THEN
+        RAISE USING
+            MESSAGE = 'This node is already a member of a BDR group',
+            HINT = 'Connect to the node you wish to add and run '||caller||' from it instead',
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    -- Validate that the local connection is usable and matches
+    -- the node identity of the node we're running on.
+    --
+    -- For BDR this will NOT check the 'dsn' if 'local_dsn'
+    -- gets supplied. We don't know if 'dsn' is even valid
+    -- for loopback connections and can't assume it is. That'll
+    -- get checked later by BDR specific code.
+    SELECT * INTO localid_from_dsn
+    FROM bdr_get_remote_nodeinfo(local_dsn);
+
+    IF localid_from_dsn.sysid <> localid.sysid
+        OR localid_from_dsn.timeline <> localid.timeline
+        OR localid_from_dsn.dboid <> localid.dboid
+    THEN
+        RAISE USING
+            MESSAGE = 'node identity for local dsn does not match current node',
+            DETAIL = format($$The dsn '%s' connects to a node with identity (%s,%s,%s) but the local node is (%s,%s,%s)$$,
+                local_dsn, localid_from_dsn.sysid, localid_from_dsn.timeline,
+                localid_from_dsn.dboid, localid.sysid, localid.timeline, localid.dboid),
+            HINT = 'The local_dsn (or, for bdr, dsn if local_dsn is null) parameter must refer to the node you''re running this function from',
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    IF NOT localid_from_dsn.is_superuser THEN
+        RAISE USING
+            MESSAGE = 'local dsn does not have superuser rights',
+            DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, local_dsn),
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    -- Now interrogate the remote node, if specified, and sanity
+    -- check its connection too. The discovered node identity is
+    -- returned if found.
+    --
+    -- This will error out if there are issues with the remote
+    -- node.
+    IF remote_dsn IS NOT NULL THEN
+        SELECT * INTO remote_nodeinfo
+        FROM bdr_get_remote_nodeinfo(remote_dsn);
+
+        remote_sysid := remote_nodeinfo.sysid;
+        remote_timeline := remote_nodeinfo.timeline;
+        remote_dboid := remote_nodeinfo.dboid;
+
+        IF NOT remote_nodeinfo.is_superuser THEN
+            RAISE USING
+                MESSAGE = 'connection to remote node does not have superuser rights',
+                DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, remote_dsn),
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+
+        IF remote_nodeinfo.version_num < bdr_min_remote_version_num() THEN
+            RAISE USING
+                MESSAGE = 'remote node''s BDR version is too old',
+                DETAIL = format($$The dsn '%s' connects successfully but the remote node version %s is less than the required version %s$$,
+                    remote_dsn, remote_nodeinfo.version_num, bdr_min_remote_version_num()),
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+
+        IF remote_nodeinfo.min_remote_version_num > bdr_version_num() THEN
+            RAISE USING
+                MESSAGE = 'remote node''s BDR version is too new or this node''s version is too old',
+                DETAIL = format($$The dsn '%s' connects successfully but the remote node version %s requires this node to run at least bdr %s, not the current %s$$,
+                    remote_dsn, remote_nodeinfo.version_num, remote_nodeinfo.min_remote_version_num,
+                    bdr_version_num()),  -- "the current" version is ours, matching the test above
+                ERRCODE = 'object_not_in_prerequisite_state';
+
+        END IF;
+
+    END IF;
+
+    -- Create local node record if needed
+    PERFORM 1 FROM bdr_nodes
+    WHERE node_sysid = localid.sysid
+      AND node_timeline = localid.timeline
+      AND node_dboid = localid.dboid;
+
+    IF NOT FOUND THEN
+        INSERT INTO bdr_nodes (
+            node_sysid, node_timeline, node_dboid,
+            node_status, node_local_dsn, node_init_from_dsn
+        ) VALUES (
+            localid.sysid, localid.timeline, localid.dboid,
+            'b', local_dsn, remote_dsn
+        );
+    END IF;
+
+    PERFORM bdr.internal_update_seclabel();
+END;
+$body$;
+
+--
+-- The public interface for node join/addition, to be run to join a currently
+-- unconnected node with a blank database to a BDR group.
+--
+CREATE FUNCTION bdr.bdr_group_join(
+    dsn text,
+    init_from_dsn text,
+    local_dsn text DEFAULT NULL,
+    apply_delay integer DEFAULT NULL,
+    replication_sets text[] DEFAULT ARRAY['default']
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    localid record;
+    connectback_nodeinfo record;
+    remoteinfo record;
+BEGIN
+    IF dsn IS NULL THEN
+        RAISE USING
+            MESSAGE = 'dsn may not be null',
+            ERRCODE = 'invalid_parameter_value';
+    END IF;
+
+    IF bdr_variant() <> 'BDR' THEN
+        RAISE USING
+            MESSAGE = 'Full BDR required but this module is built for '||bdr_variant(),
+            DETAIL = 'The local node is not running full BDR, which is required to use bdr_group_join',
+            HINT = 'Install full BDR if possible or use the UDR functions.',
+            ERRCODE = 'feature_not_supported';
+    END IF;
+
+    PERFORM bdr.internal_begin_join(
+        'bdr_group_join',
+        CASE WHEN local_dsn IS NULL THEN dsn ELSE local_dsn END,
+        init_from_dsn);
+
+    SELECT sysid, timeline, dboid INTO localid
+    FROM bdr.bdr_get_local_nodeid();
+
+    -- Request additional connection tests to determine that the remote is
+    -- reachable for replication and non-replication mode and that the remote
+    -- can connect back to us via 'dsn' on non-replication and replication
+    -- modes.
+    --
+    -- This cannot be checked for the first node since there's no peer
+    -- to ask for help.
+    IF init_from_dsn IS NOT NULL THEN
+
+        SELECT * INTO connectback_nodeinfo
+        FROM bdr.bdr_test_remote_connectback(init_from_dsn, dsn);
+
+        -- The connectback must actually match our local node identity
+        -- and must provide a superuser connection.
+        IF NOT connectback_nodeinfo.is_superuser THEN
+            RAISE USING
+                MESSAGE = 'dsn does not have superuser rights when connecting via remote node',
+                DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, dsn),
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+
+        IF connectback_nodeinfo.sysid <> localid.sysid
+           OR connectback_nodeinfo.timeline <> localid.timeline
+           OR connectback_nodeinfo.dboid <> localid.dboid
+        THEN
+            RAISE USING
+                MESSAGE = 'node identity for dsn does not match current node when connecting back via remote',
+                DETAIL = format($$The dsn '%s' connects to a node with identity (%s,%s,%s) but the local node is (%s,%s,%s)$$,
+                    dsn, connectback_nodeinfo.sysid, connectback_nodeinfo.timeline,  -- report the dsn that was tested
+                    connectback_nodeinfo.dboid, localid.sysid, localid.timeline, localid.dboid),
+                HINT = 'The ''dsn'' parameter must refer to the node you''re running this function from, from the perspective of the node pointed to by init_from_dsn',
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+    END IF;
+
+    -- Null/empty checks are skipped, the underlying constraints on the table
+    -- will catch that for us.
+    INSERT INTO bdr.bdr_connections (
+        conn_sysid, conn_timeline, conn_dboid,
+        conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+        conn_dsn, conn_apply_delay, conn_replication_sets,
+        conn_is_unidirectional
+    ) VALUES (
+        localid.sysid, localid.timeline, localid.dboid,
+        '0', 0, 0,
+        dsn, apply_delay, replication_sets, false
+    );
+
+    -- Now ensure the per-db worker is started if it's not already running.
+    -- This won't actually take effect until commit time, it just adds a commit
+    -- hook to start the worker when we commit.
+    PERFORM bdr.bdr_connections_changed();
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_group_join(text,text,text,integer,text[])
+IS 'Join an existing BDR group by connecting to a member node and copying its contents';
+
+CREATE FUNCTION bdr.bdr_group_create(
+    dsn text,
+    local_dsn text DEFAULT NULL,
+    apply_delay integer DEFAULT NULL,
+    replication_sets text[] DEFAULT ARRAY['default']
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+BEGIN
+    PERFORM bdr.bdr_group_join(
+        dsn, init_from_dsn := null, local_dsn := local_dsn,
+        apply_delay := apply_delay,
+        replication_sets := replication_sets);
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_group_create(text,text,integer,text[])
+IS 'Create a BDR group, turning a stand-alone database into the first node in a BDR group';
+
+--
+-- The public interface for unidirectional replication setup.
+--
+CREATE FUNCTION bdr.bdr_subscribe(
+    remote_dsn text,
+    local_dsn text,
+    apply_delay integer DEFAULT NULL,
+    replication_sets text[] DEFAULT ARRAY['default']
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    localid record;
+    remoteid record;
+BEGIN
+    IF local_dsn IS NULL THEN
+        RAISE USING
+            MESSAGE = 'local_dsn may not be null',
+            ERRCODE = 'invalid_parameter_value';
+    END IF;
+
+    IF remote_dsn IS NULL THEN
+        RAISE USING
+            MESSAGE = 'remote_dsn may not be null',
+            ERRCODE = 'invalid_parameter_value';
+    END IF;
+
+    SELECT remote_sysid AS sysid, remote_timeline AS timeline,
+           remote_dboid AS dboid INTO remoteid
+    FROM bdr.internal_begin_join('bdr_subscribe', local_dsn, remote_dsn);
+
+    SELECT sysid, timeline, dboid INTO localid
+    FROM bdr.bdr_get_local_nodeid();
+
+    PERFORM 1 FROM bdr_connections  -- refuse a duplicate unidirectional subscription
+    WHERE conn_sysid = remoteid.sysid
+      AND conn_timeline = remoteid.timeline
+      AND conn_dboid = remoteid.dboid
+      AND conn_origin_sysid = localid.sysid
+      AND conn_origin_timeline = localid.timeline
+      AND conn_origin_dboid = localid.dboid
+      AND conn_is_unidirectional = 't';
+
+    IF FOUND THEN
+        RAISE USING
+            MESSAGE = 'This node is already connected to given remote node',
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    -- Null/empty checks are skipped, the underlying constraints on the table
+    -- will catch that for us.
+    INSERT INTO bdr.bdr_connections (
+        conn_sysid, conn_timeline, conn_dboid,
+        conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+        conn_dsn, conn_apply_delay, conn_replication_sets,
+        conn_is_unidirectional
+    ) VALUES (
+        remoteid.sysid, remoteid.timeline, remoteid.dboid,
+        localid.sysid, localid.timeline, localid.dboid,
+        remote_dsn, apply_delay, replication_sets, true
+    );
+
+    -- Now ensure the per-db worker is started if it's not already running.
+    -- This won't actually take effect until commit time, it just adds a commit
+    -- hook to start the worker when we commit.
+    PERFORM bdr.bdr_connections_changed();
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_subscribe(text,text,integer,text[])
+IS 'Subscribe to remote logical changes';
+
+CREATE FUNCTION bdr.bdr_node_join_wait_for_ready()
+RETURNS void LANGUAGE plpgsql VOLATILE AS $body$
+DECLARE
+    _node_status "char";
+BEGIN
+    IF current_setting('transaction_isolation') <> 'read committed' THEN
+        RAISE EXCEPTION 'Can only wait for node join in an ISOLATION LEVEL READ COMMITTED transaction, not %',
+                        current_setting('transaction_isolation');
+    END IF;
+
+    LOOP  -- poll the local node's bdr_nodes row until its status is 'r'
+        SELECT INTO _node_status
+          node_status
+        FROM bdr.bdr_nodes
+        WHERE (node_sysid, node_timeline, node_dboid)
+              = bdr.bdr_get_local_nodeid();
+
+        EXIT WHEN _node_status = 'r';  -- test first so a ready node returns at once
+
+        PERFORM pg_sleep(0.5);  -- not ready (or no row yet): wait, then re-check
+    END LOOP;
+END;
+$body$;
+
+CREATE FUNCTION bdr_upgrade_to_090(my_conninfo cstring, local_conninfo cstring, remote_conninfo cstring)
+RETURNS void LANGUAGE c AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION bdr_upgrade_to_090(cstring,cstring,cstring) FROM public;
+
+COMMENT ON FUNCTION bdr_upgrade_to_090(cstring,cstring,cstring)
+IS 'Upgrade a BDR 0.7.x or 0.8.x node to BDR 0.9.0 dynamic configuration. remote_conninfo is the node to connect to to perform the upgrade, my_conninfo is the dsn for other nodes to connect to this node with, local_conninfo is used to connect locally back to the node. Use null remote conninfo on the first node.';
+
+RESET bdr.permit_unsafe_ddl_commands;
+RESET bdr.skip_ddl_replication;
+RESET search_path;
diff --git a/scripts/bdr_initial_load.in b/scripts/bdr_initial_load.in

index 50eb1c20fc93ee002bd881861e6810c694080ab3..09459fa7791621813105499b94f5ab18053cabac 100644 (file)
--- a/scripts/bdr_initial_load.in
+++ b/scripts/bdr_initial_load.in
@@ -11,7 +11,7 @@
  
  errlog()
  {
-    echo "$@" 1>&2
+   echo "$@" 1>&2
  }
  
  JOBS=1
@@ -23,78 +23,78 @@ PGDUMP=
  PGRESTORE=
  
  while (($i < ${#argv[*]})); do
-    case "${argv[$i]}" in
+   case "${argv[$i]}" in
     -V)
-       echo "bdr_initial_load (PostgreSQL PG_VERSION, BDR BDR_VERSION)"
-       exit
+       echo "bdr_initial_load (PostgreSQL PG_VERSION, BDR BDR_VERSION)"
+       exit
     ;;
-        --snapshot)
-            ((i++)); SNAPSHOT="${argv[$i]}"
-        ;;
-        --source)
-            ((i++)); SOURCE="${argv[$i]}"
-        ;;
-        --target)
-            ((i++)); TARGET="${argv[$i]}"
-        ;;
-        --tmp-directory)
-            ((i++)); TMPDIR="${argv[$i]}"
-        ;;
-        --jobs)
-            ((i++)); JOBS="${argv[$i]}"
+   --snapshot)
+       ((i++)); SNAPSHOT="${argv[$i]}"
     ;;
-        --pg-dump-path)
-            ((i++)); PGDUMP="${argv[$i]}"
-        ;;
-        --pg-restore-path)
-            ((i++)); PGRESTORE="${argv[$i]}"
-        ;;
-        --help)
-            errlog "Usage: bdr_replica --source <dsn> --target <dsn> [--snapshot <name>] --dir /path/to/dir [--jobs N]"
-            errlog "<dsn> is a libpq conninfo string, e.g. \"host=/tmp post=5433 dbname=xxx\""
-            exit 0
-        ;;
-        *)
-            errlog Unknown command-line option: ${argv[$i]}
-            exit 1
-        ;;
-    esac
+   --source)
+       ((i++)); SOURCE="${argv[$i]}"
+   ;;
+   --target)
+       ((i++)); TARGET="${argv[$i]}"
+   ;;
+   --tmp-directory)
+       ((i++)); TMPDIR="${argv[$i]}"
+   ;;
+   --jobs)
+       ((i++)); JOBS="${argv[$i]}"
+   ;;
+   --pg-dump-path)
+       ((i++)); PGDUMP="${argv[$i]}"
+   ;;
+   --pg-restore-path)
+       ((i++)); PGRESTORE="${argv[$i]}"
+   ;;
+   --help)
+       errlog "Usage: bdr_initial_load --source <dsn> --target <dsn> [--snapshot <name>] --tmp-directory /path/to/dir --pg-dump-path /path/to/pg_dump --pg-restore-path /path/to/pg_restore [--jobs N]"
+       errlog "<dsn> is a libpq conninfo string, e.g. \"host=/tmp port=5433 dbname=xxx\""
+       exit 0
+   ;;
+   *)
+       errlog Unknown command-line option: ${argv[$i]}
+       exit 1
+   ;;
+   esac
  
-    ((i++))
+   ((i++))
  done
  
  if [ -z "$SOURCE" ]; then
-    errlog Please specify a source DSN with '--source "port=nnn dbname=xxx"'; exit 1
+   errlog Please specify a source DSN with '--source "port=nnn dbname=xxx"'; exit 1
  fi
  
  if [ -z "$TARGET" ]; then
-    errlog Please specify a target DSN with '--target "port=nnn dbname=xxx"'; exit 1
+   errlog Please specify a target DSN with '--target "port=nnn dbname=xxx"'; exit 1
  fi
  
  if [ -z "$TMPDIR" ]; then
-    errlog Please specify a directory with '--temp-directory /path/to/dir'; exit 1
+   errlog Please specify a directory with '--temp-directory /path/to/dir'; exit 1
  fi
  
  if [ -z "$PGDUMP" ]; then
-    errlog The path to pg_dump must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
+   errlog The path to pg_dump must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
  fi
  
  if [ -z "$PGRESTORE" ]; then
-    errlog The path to pg_restore must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
+   errlog The path to pg_restore must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
  fi
  
  SNAP=${SNAPSHOT:+"--snapshot $SNAPSHOT"}
  
  errlog "Dumping remote database \"$SOURCE\" with $JOBS concurrent workers to \"$TMPDIR\""
-if ! "$PGDUMP" -j $JOBS $SNAP -F d -f $TMPDIR "$SOURCE"; then
-    errlog "bdr_dump of "$SOURCE" failed, aborting"
-    exit 1
+if ! "$PGDUMP" -T "bdr.bdr_nodes" -T "bdr.bdr_connections" -j $JOBS $SNAP -F d -f $TMPDIR "$SOURCE"; then
+   errlog "bdr_dump of "$SOURCE" failed, aborting"
+   exit 1
  fi
  
  errlog "Restoring dump to local DB \"$TARGET\" with $JOBS concurrent workers from \"$TMPDIR\""
-if ! "$PGRESTORE" --exit-on-error --single-transaction -j $JOBS -F d -d "$TARGET" $TMPDIR; then
-    errlog "pg_restore to "$TARGET" failed, aborting"
-    exit 2
+if ! "$PGRESTORE" --exit-on-error -j $JOBS -F d -d "$TARGET" $TMPDIR; then
+   errlog "pg_restore to "$TARGET" failed, aborting"
+   exit 2
  fi
  
  exit 0
diff --git a/specs/isolation/init.spec b/specs/isolation/init.spec

new file mode 100644 (file)

index 0000000..1b824f0
--- /dev/null
+++ b/specs/isolation/init.spec
@@ -0,0 +1,127 @@
+conninfo "node1" "dbname=node1"
+conninfo "node2" "dbname=node2"
+conninfo "node3" "dbname=node3"
+
+session "snode1"
+
+# pg_xlog_wait_remote_apply isn't good enough alone as it doesn't permit us to
+# say how many nodes must be present.  It'll succeed if there are zero nodes.
+# So we first have to wait for enough replication connections.
+#
+# The reason why we call pg_stat_clear_snapshot() is that pg_stat_activity is
+# cached when first accessed so repeat access within the same transaction sees
+# unchanging results. As pg_stat_replication joins pg_stat_get_wal_senders() on
+# pg_stat_activity, new walsenders are filtered out by the join unless we force
+# a refresh of pg_stat_activity.
+
+connection "node1"
+
+step "setup1"
+{
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+}
+
+
+step "join_root"
+{
+   SELECT bdr.bdr_group_create(
+       dsn := 'dbname=node1'
+       );
+}
+
+step "wait"
+{
+   -- pg_xlog_wait_remote_apply isn't good enough alone
+   -- as it doesn't permit us to say how many nodes must be present.
+   -- It'll succeed if there are zero nodes. So we first have to wait
+   -- for enough replication connections.
+   DO $$
+   DECLARE
+       nodecount integer := 0;
+       target_lsn pg_lsn;
+   BEGIN
+       WHILE nodecount <> 6
+       LOOP
+           PERFORM pg_sleep(1);
+           PERFORM pg_stat_clear_snapshot();
+           -- Now find out how many walsenders are running
+           nodecount := (SELECT count(*)
+                         FROM pg_catalog.pg_stat_replication);
+           RAISE NOTICE 'Found % nodes',nodecount;
+       END LOOP;
+       -- OK, all nodes seen, now we wait for catchup on them all.
+       target_lsn := pg_current_xlog_location();
+       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
+       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
+       RAISE NOTICE 'Catchup to LSN completed';
+   END;
+   $$;
+}
+
+session "snode2"
+connection "node2"
+
+step "setup2"
+{
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+}
+
+
+step "join_2"
+{
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node2',
+       init_from_dsn := 'dbname=node1'
+       );
+}
+
+step "wait_join_2"
+{
+   SELECT bdr.bdr_node_join_wait_for_ready();
+}
+
+step "check_join_2"
+{
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+}
+
+session "snode3"
+connection "node3"
+
+step "setup3"
+{
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+}
+
+
+step "join_3"
+{
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node3',
+       init_from_dsn := 'dbname=node1',
+       local_dsn := 'dbname=node3'
+       );
+}
+
+step "wait_join_3"
+{
+   SELECT bdr.bdr_node_join_wait_for_ready();
+}
+
+step "check_join_3"
+{
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+}
+
+permutation "setup1" "setup2" "setup3" "join_root" "join_2" "wait_join_2" "check_join_2" "join_3" "wait_join_3" "check_join_3" "wait"
diff --git a/specs/isolation/waitforstart.spec b/specs/isolation/waitforstart.spec

deleted file mode 100644 (file)

index b5ef85c..0000000
--- a/specs/isolation/waitforstart.spec
+++ /dev/null
@@ -1,42 +0,0 @@
-conninfo "node1" "dbname=node1"
-conninfo "node2" "dbname=node2"
-conninfo "node3" "dbname=node3"
-
-session "snode1"
-
-# pg_xlog_wait_remote_apply isn't good enough alone as it doesn't permit us to
-# say how many nodes must be present.  It'll succeed if there are zero nodes.
-# So we first have to wait for enough replication connections.
-#
-# The reason why we call pg_stat_clear_snapshot() is that pg_stat_activity is
-# cached when first accessed so repeat access within the same transaction sees
-# unchanging results. As pg_stat_replication joins pg_stat_get_wal_senders() on
-# pg_stat_activity, new walsenders are filtered out by the join unles we force
-# a refresh of pg_stat_activity.
-
-step "wait"
-{
-   DO $$
-   DECLARE
-       nodecount integer := 0;
-       target_lsn pg_lsn;
-   BEGIN
-       WHILE nodecount <> 6
-       LOOP
-           PERFORM pg_sleep(1);
-           PERFORM pg_stat_clear_snapshot();
-           -- Now find out how many walsenders are running
-           nodecount := (SELECT count(*)
-                         FROM pg_catalog.pg_stat_replication);
-           RAISE NOTICE 'Found % nodes',nodecount;
-       END LOOP;
-       -- OK, all nodes seen, now we wait for catchup on them all.
-       target_lsn := pg_current_xlog_location();
-       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
-       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
-       RAISE NOTICE 'Catchup to LSN completed';
-   END;
-   $$;
-}
-
-permutation "wait"
diff --git a/sql/ddl/create.sql b/sql/ddl/create.sql

index e47c6f180b489c399613cb6ae450463ede7245fc..19140e54e59feb8adcb4a5d6a561191f9f728aa0 100644 (file)
--- a/sql/ddl/create.sql
+++ b/sql/ddl/create.sql
@@ -212,6 +212,7 @@ CREATE TABLE tbl_with_oids() WITH OIDS;
  CREATE TABLE tbl_without_oids() WITHOUT oids;
  DROP TABLE tbl_without_oids;
  SET default_with_oids = false;
+SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), pid) FROM pg_stat_replication;
  
  --- AGGREGATE ---
  \c postgres
diff --git a/sql/init.sql b/sql/init.sql

index 10271198731d4458481e9d7f9cb33bee084d2a21..1a7c1de30f8000257436b3de6840d00c1f398ed7 100644 (file)
--- a/sql/init.sql
+++ b/sql/init.sql
@@ -12,37 +12,10 @@ GRANT ALL ON SCHEMA public TO nonsuper;
  \c regression
  GRANT ALL ON SCHEMA public TO nonsuper;
  
-SELECT pg_sleep(10);
-
--- emulate the pg_xlog_wait_remote_apply on vanilla postgres
-DO $DO$BEGIN
-   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
-   IF FOUND THEN
-       RETURN;
-   END IF;
-
-   PERFORM bdr.bdr_replicate_ddl_command($DDL$
-       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
-       AS $FUNC$
-       BEGIN
-           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
-               PERFORM pg_sleep(0.01);
-           END LOOP;
-       END;$FUNC$ LANGUAGE plpgsql;
-   $DDL$);
-END;$DO$;
+\c postgres
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
  
-SELECT bdr.bdr_replicate_ddl_command($DDL$
-CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
-    OUT readdb1 text,
-    OUT readdb2 text,
-    OUT writedb1 text,
-    OUT writedb2 text
-    ) RETURNS record LANGUAGE SQL AS $f$
-SELECT
-    current_setting('bdrtest.readdb1'),
-    current_setting('bdrtest.readdb2'),
-    current_setting('bdrtest.writedb1'),
-    current_setting('bdrtest.writedb2')
-$f$;
-$DDL$);
+\c regression
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
diff --git a/sql/init_bdr.sql b/sql/init_bdr.sql

new file mode 100644 (file)

index 0000000..6f584bc
--- /dev/null
+++ b/sql/init_bdr.sql
@@ -0,0 +1,44 @@
+\c postgres
+SELECT bdr.bdr_group_create(
+   dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-1']
+   );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+\c regression
+SELECT bdr.bdr_group_join(
+   dsn := 'dbname=regression',
+   init_from_dsn := 'dbname=postgres',
+   local_dsn := 'dbname=regression',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+-- Make sure we see two slots and two active connections
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT count(*) FROM pg_stat_replication;
+
+\c postgres
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
diff --git a/sql/init_udr.sql b/sql/init_udr.sql

new file mode 100644 (file)

index 0000000..2e3a62b
--- /dev/null
+++ b/sql/init_udr.sql
@@ -0,0 +1,52 @@
+\c postgres
+SELECT bdr.bdr_subscribe(
+   remote_dsn := 'dbname=regression',
+   local_dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+-- Make sure we see the slot and active connection
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT count(*) FROM pg_stat_replication;
+
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+
+-- emulate the pg_xlog_wait_remote_apply on vanilla postgres
+DO $DO$BEGIN
+   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
+   IF FOUND THEN
+       RETURN;
+   END IF;
+
+   PERFORM bdr.bdr_replicate_ddl_command($DDL$
+       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
+       AS $FUNC$
+       BEGIN
+           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
+               PERFORM pg_sleep(0.01);
+           END LOOP;
+       END;$FUNC$ LANGUAGE plpgsql;
+   $DDL$);
+END;$DO$;
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
diff --git a/sql/upgrade.sql b/sql/upgrade.sql

index 648cb754ef35c555f4ebe5216169ca050defead0..b0a4209511041c2a99e4bc09ecfcb42596ff21c0 100644 (file)
--- a/sql/upgrade.sql
+++ b/sql/upgrade.sql
@@ -36,6 +36,9 @@ DROP EXTENSION bdr;
  CREATE EXTENSION bdr VERSION '0.9.0.0';
  DROP EXTENSION bdr;
  
+CREATE EXTENSION bdr VERSION '0.9.0.1';
+DROP EXTENSION bdr;
+
  -- evolve version one by one from the oldest to the newest one
  CREATE EXTENSION bdr VERSION '0.8.0';
  ALTER EXTENSION bdr UPDATE TO '0.8.0.1';
@@ -46,6 +49,7 @@ ALTER EXTENSION bdr UPDATE TO '0.8.0.5';
  ALTER EXTENSION bdr UPDATE TO '0.8.0.6';
  ALTER EXTENSION bdr UPDATE TO '0.8.0.7';
  ALTER EXTENSION bdr UPDATE TO '0.9.0.0';
+ALTER EXTENSION bdr UPDATE TO '0.9.0.1';
  
  
  -- Should never have to do anything: You missed adding the new version above.
diff --git a/sql/upgrade_sim_0800.sql b/sql/upgrade_sim_0800.sql

new file mode 100644 (file)

index 0000000..c84f70f
--- /dev/null
+++ b/sql/upgrade_sim_0800.sql
@@ -0,0 +1,183 @@
+--
+-- Attempt to simulate an upgrade from BDR 0.8.0 to the current
+-- version.
+--
+-- 0.8.0 used GUCs for bdr.connections DSN configuration, etc. We can manually
+-- create the slots, replication identifiers, and bdr.bdr_nodes entries as if
+-- this was a 0.8.0 DB just about to be upgraded, then upgrade the extension
+-- and execute the upgrade process.
+--
+
+
+CREATE DATABASE upgrade_sim_0800_a;
+CREATE DATABASE upgrade_sim_0800_b;
+
+\c upgrade_sim_0800_a;
+------------------------------------------
+-- Prepare node upgrade_sim_0800_a      --
+------------------------------------------
+
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr VERSION '0.8.0';
+
+-- public.bdr_get_local_nodeid() is defined in the bdr extension's C library
+-- but is not exposed by 0.8.0's SQL extension. We have to use it to create
+-- the required slots etc, so create it in public.
+
+CREATE FUNCTION public.bdr_get_local_nodeid( sysid OUT text, timeline OUT oid, dboid OUT oid)
+RETURNS record LANGUAGE c AS 'bdr';
+
+CREATE TABLE dummytable(
+   id integer primary key,
+   somevalue text
+);
+
+INSERT INTO dummytable(id, somevalue) VALUES (1, '42'), (2, 'fred');
+
+SELECT pg_replication_identifier_create(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       ''
+   )
+)
+FROM public.bdr_get_local_nodeid();
+
+INSERT INTO bdr.bdr_nodes
+(node_sysid, node_timeline, node_dboid, node_status)
+SELECT
+   sysid, timeline, (SELECT oid FROM pg_database WHERE datname = dn), 'r'
+FROM (VALUES ('upgrade_sim_0800_a'), ('upgrade_sim_0800_b')) x(dn),
+    public.bdr_get_local_nodeid();
+
+SELECT pg_create_logical_replication_slot(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       ''
+   ),
+   'bdr')
+FROM public.bdr_get_local_nodeid();
+
+DROP FUNCTION public.bdr_get_local_nodeid();
+
+
+
+
+
+
+\c upgrade_sim_0800_b;
+------------------------------------------
+-- Prepare node upgrade_sim_0800_b      --
+------------------------------------------
+
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr VERSION '0.8.0';
+
+CREATE FUNCTION public.bdr_get_local_nodeid( sysid OUT text, timeline OUT oid, dboid OUT oid)
+RETURNS record LANGUAGE c AS 'bdr';
+
+CREATE TABLE dummytable(
+   id integer primary key,
+   somevalue text
+);
+
+INSERT INTO dummytable(id, somevalue) VALUES (1, '42'), (2, 'fred');
+
+SELECT pg_replication_identifier_create(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       ''
+   )
+)
+FROM public.bdr_get_local_nodeid();
+
+INSERT INTO bdr.bdr_nodes
+(node_sysid, node_timeline, node_dboid, node_status)
+SELECT
+   sysid, timeline, (SELECT oid FROM pg_database WHERE datname = dn), 'r'
+FROM (VALUES ('upgrade_sim_0800_a'), ('upgrade_sim_0800_b')) x(dn),
+    public.bdr_get_local_nodeid();
+
+SELECT pg_create_logical_replication_slot(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       ''
+   ),
+   'bdr')
+FROM public.bdr_get_local_nodeid();
+
+DROP FUNCTION public.bdr_get_local_nodeid();
+
+
+
+
+
+------------------------------------------
+-- Test the upgrade                     --
+------------------------------------------
+--
+-- We now have two databases that look like they were running BDR, with
+-- contents in sync at the time of upgrade. The origin replication identifier
+-- information is wrong as both have InvalidRepNodeId but we don't really care
+-- about that. It's as if we deleted bdr.bdr_connections then started the DB
+-- up.
+--
+-- Time to upgrade to dynconf. Hope this works!
+--
+
+-- First the extension must be updated on BOTH nodes
+\c upgrade_sim_0800_a
+ALTER EXTENSION bdr UPDATE;
+\c upgrade_sim_0800_b
+ALTER EXTENSION bdr UPDATE;
+
+
+-- then one must be upgraded standalone. For this one we'll provide no local
+-- dsn; it must be inferred from the node dsn in that case. There's also no
+-- remote DSN since it's the first node.
+\c upgrade_sim_0800_a
+SELECT bdr.bdr_upgrade_to_090('dbname=upgrade_sim_0800_a', NULL, NULL);
+
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+ORDER BY datname;
+
+SELECT * FROM pg_catalog.pg_shseclabel
+WHERE classoid = (SELECT oid FROM pg_class WHERE relname = 'pg_database')
+  AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+
+-- Upgrade the second node using the first node. This time we'll
+-- supply a local dsn too, though it'll be the same.
+\c upgrade_sim_0800_b
+
+-- Must still show the old node entries, as no replication can have occurred yet
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+ORDER BY datname;
+
+SELECT bdr.bdr_upgrade_to_090('dbname=upgrade_sim_0800_b', 'dbname=upgrade_sim_0800_b', 'dbname=upgrade_sim_0800_a');
+
+-- The local node's entry must be updated. The remote node's entry could be in
+-- either state, as replication might or might not have sent it yet.
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+WHERE datname = current_database()
+ORDER BY datname;
+
+SELECT * FROM pg_catalog.pg_shseclabel
+WHERE classoid = (SELECT oid FROM pg_class WHERE relname = 'pg_database')
+  AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+-- TODO: wait for remote apply, switch back
+
+-- TODO: use test table
+
+-- TODO: lots of failure cases
author	Craig Ringer <craig@2ndquadrant.com>
	Sat, 7 Feb 2015 11:54:17 +0000 (00:54 +1300)
committer	Andres Freund <andres@anarazel.de>
	Thu, 12 Feb 2015 09:16:58 +0000 (10:16 +0100)
Makefile.in		patch \| blob \| blame \| history
bdr.c		patch \| blob \| blame \| history
bdr.control		patch \| blob \| blame \| history
bdr.h		patch \| blob \| blame \| history
bdr_apply.c		patch \| blob \| blame \| history
bdr_catalogs.c		patch \| blob \| blame \| history
bdr_common.c	[new file with mode: 0644]	patch \| blob
bdr_conflict_logging.c		patch \| blob \| blame \| history
bdr_executor.c		patch \| blob \| blame \| history
bdr_init_copy.c		patch \| blob \| blame \| history
bdr_init_replica.c		patch \| blob \| blame \| history
bdr_internal.h		patch \| blob \| blame \| history
bdr_isolationregress.conf		patch \| blob \| blame \| history
bdr_label.c		patch \| blob \| blame \| history
bdr_label.h		patch \| blob \| blame \| history
bdr_locks.c		patch \| blob \| blame \| history
bdr_locks.h		patch \| blob \| blame \| history
bdr_output.c		patch \| blob \| blame \| history
bdr_perdb.c		patch \| blob \| blame \| history
bdr_regress_bdr.conf		patch \| blob \| blame \| history
bdr_relcache.c		patch \| blob \| blame \| history
bdr_seq.c		patch \| blob \| blame \| history
bdr_supervisor.c	[new file with mode: 0644]	patch \| blob
bdr_upgrade.c	[new file with mode: 0644]	patch \| blob
expected/ddl/create.out		patch \| blob \| blame \| history
expected/init.out		patch \| blob \| blame \| history
expected/init_bdr.out	[new file with mode: 0644]	patch \| blob
expected/init_udr.out	[new file with mode: 0644]	patch \| blob
expected/isolation/init.out	[new file with mode: 0644]	patch \| blob
expected/isolation/waitforstart.out	[deleted file]	patch \| blob \| blame \| history
expected/upgrade.out		patch \| blob \| blame \| history
extsql/bdr--0.8.0.sql		patch \| blob \| blame \| history
extsql/bdr--0.9.0.0--0.9.0.1.sql	[new file with mode: 0644]	patch \| blob
scripts/bdr_initial_load.in		patch \| blob \| blame \| history
specs/isolation/init.spec	[new file with mode: 0644]	patch \| blob
specs/isolation/waitforstart.spec	[deleted file]	patch \| blob \| blame \| history
sql/ddl/create.sql		patch \| blob \| blame \| history
sql/init.sql		patch \| blob \| blame \| history
sql/init_bdr.sql	[new file with mode: 0644]	patch \| blob
sql/init_udr.sql	[new file with mode: 0644]	patch \| blob
sql/upgrade.sql		patch \| blob \| blame \| history
sql/upgrade_sim_0800.sql	[new file with mode: 0644]	patch \| blob