bdr: Replace GUC-based connections with SQL and tables
author Craig Ringer <craig@2ndquadrant.com>
Sat, 7 Feb 2015 11:54:17 +0000 (00:54 +1300)
committer Andres Freund <andres@anarazel.de>
Thu, 12 Feb 2015 09:16:58 +0000 (10:16 +0100)
BDR's configuration interface and the related internals have been mostly
rewritten so that GUCs are not used for specifying bdr connections.

The bdr.connections GUC and bdr.<nodename>_dsn etc. are removed by this
commit. Use of these settings will now emit warnings in the server error
log.

Upgrades to the new version must be performed with manual steps covered
in the documentation. Simply replacing the binaries is not sufficient.

Major changes:

* There is only one static worker, a new bdr_supervisor worker

* The supervisor decides which databases to start per-db workers
  for by examining each database for a 'bdr' security label

* per-db workers scan a new bdr.bdr_connections table to decide
  which nodes to launch apply workers for.

* apply workers look up bdr.bdr_connections to get connection
  info instead of checking GUCs

* Explicit creation of the first node is now required with
  bdr.bdr_group_create(...)

* New nodes must be joined with an SQL level bdr.bdr_group_join(...)
  call.

42 files changed:
Makefile.in
bdr.c
bdr.control
bdr.h
bdr_apply.c
bdr_catalogs.c
bdr_common.c [new file with mode: 0644]
bdr_conflict_logging.c
bdr_executor.c
bdr_init_copy.c
bdr_init_replica.c
bdr_internal.h
bdr_isolationregress.conf
bdr_label.c
bdr_label.h
bdr_locks.c
bdr_locks.h
bdr_output.c
bdr_perdb.c
bdr_regress_bdr.conf
bdr_relcache.c
bdr_seq.c
bdr_supervisor.c [new file with mode: 0644]
bdr_upgrade.c [new file with mode: 0644]
expected/ddl/create.out
expected/init.out
expected/init_bdr.out [new file with mode: 0644]
expected/init_udr.out [new file with mode: 0644]
expected/isolation/init.out [new file with mode: 0644]
expected/isolation/waitforstart.out [deleted file]
expected/upgrade.out
extsql/bdr--0.8.0.sql
extsql/bdr--0.9.0.0--0.9.0.1.sql [new file with mode: 0644]
scripts/bdr_initial_load.in
specs/isolation/init.spec [new file with mode: 0644]
specs/isolation/waitforstart.spec [deleted file]
sql/ddl/create.sql
sql/init.sql
sql/init_bdr.sql [new file with mode: 0644]
sql/init_udr.sql [new file with mode: 0644]
sql/upgrade.sql
sql/upgrade_sim_0800.sql [new file with mode: 0644]

index e92e62f18ecd076948d5dd81898d03b9233c6e2d..318c063f1be03956fba13b3087a37030234346fa 100644 (file)
@@ -20,7 +20,8 @@ DATA = \
    extsql/bdr--0.8.0.4--0.8.0.5.sql \
    extsql/bdr--0.8.0.5--0.8.0.6.sql \
    extsql/bdr--0.8.0.6--0.8.0.7.sql \
-   extsql/bdr--0.8.0.7--0.9.0.0.sql
+   extsql/bdr--0.8.0.7--0.9.0.0.sql \
+   extsql/bdr--0.9.0.0--0.9.0.1.sql
 
 DATA_built = \
    extsql/bdr--0.8.0.1.sql \
@@ -30,7 +31,8 @@ DATA_built = \
    extsql/bdr--0.8.0.5.sql \
    extsql/bdr--0.8.0.6.sql \
    extsql/bdr--0.8.0.7.sql \
-   extsql/bdr--0.9.0.0.sql
+   extsql/bdr--0.9.0.0.sql \
+   extsql/bdr--0.9.0.1.sql
 
 DOCS = bdr.conf.sample README.bdr
 SCRIPTS = scripts/bdr_initial_load bdr_init_copy bdr_resetxlog bdr_dump
@@ -47,6 +49,7 @@ OBJS = \
    bdr_conflict_handlers.o \
    bdr_conflict_logging.o \
    bdr_commandfilter.o \
+   bdr_common.o \
    bdr_compat.o \
    bdr_count.o \
    bdr_executor.o \
@@ -55,7 +58,9 @@ OBJS = \
    bdr_locks.o \
    bdr_output.o \
    bdr_relcache.o \
-   bdr_remotecalls.o
+   bdr_remotecalls.o \
+   bdr_supervisor.o \
+   bdr_upgrade.o
 
 ifeq "@BUILDING_BDR@" "1"
 OBJS += \
@@ -91,7 +96,6 @@ bdr_init_copy: bdr_init_copy.o
    $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(libpq_pgport) $(LIBS) -o $@$(X)
 
 scripts/bdr_initial_load: scripts/bdr_initial_load.in
-   mkdir -p scripts
    sed -e "s/BDR_VERSION/$(BDR_VERSION)/" -e "s/PG_VERSION/$(VERSION)/" $< > $@
 
 extsql/bdr--0.8.0.1.sql: extsql/bdr--0.8.0.sql extsql/bdr--0.8.0--0.8.0.1.sql
@@ -124,6 +128,11 @@ extsql/bdr--0.8.0.7.sql: extsql/bdr--0.8.0.6.sql extsql/bdr--0.8.0.6--0.8.0.7.sq
    cat $^ > $@
 
 extsql/bdr--0.9.0.0.sql: extsql/bdr--0.8.0.7.sql extsql/bdr--0.8.0.7--0.9.0.0.sql
+   mkdir -p extsql
+   cat $^ > $@
+
+extsql/bdr--0.9.0.1.sql: extsql/bdr--0.9.0.0.sql extsql/bdr--0.9.0.0--0.9.0.1.sql
+   mkdir -p extsql
    cat $^ > $@
 
 bdr_resetxlog: pg_resetxlog.o
@@ -175,13 +184,17 @@ check: regresscheck isolationcheck
 DDLREGRESSCHECKS=ddl/create ddl/alter_table ddl/extension ddl/function \
                 ddl/grant ddl/mixed ddl/namespace ddl/replication_set \
                 ddl/sequence ddl/view
+REGRESSINIT=init_bdr
 else
 check: regresscheck
 DDLREGRESSCHECKS=
+REGRESSINIT=init_udr
 endif
 
+
 REGRESSCHECKS= \
    init \
+   $(REGRESSINIT) \
    upgrade \
    identifier \
    $(DDLREGRESSCHECKS) \
@@ -189,7 +202,7 @@ REGRESSCHECKS= \
 
 
 ISOLATIONCHECKS=\
-   isolation/waitforstart \
+   isolation/init \
    isolation/ddlconflict \
    isolation/dmlconflict_ii \
    isolation/dmlconflict_uu \
diff --git a/bdr.c b/bdr.c
index 276fa1c705a98d969985dc9b7bcc63071596fc9a..8090d57760bbbd8b07e03e74e73a7cc3895f96e1 100644 (file)
--- a/bdr.c
+++ b/bdr.c
@@ -71,23 +71,18 @@ extern Oid          origin_dboid;
 /* end externs for bdr apply state */
 
 ResourceOwner bdr_saved_resowner;
-static bool bdr_is_restart = false;
 Oid   BdrNodesRelid;
 Oid   BdrConflictHistoryRelId;
 Oid   BdrLocksRelid;
 Oid   BdrLocksByOwnerRelid;
 Oid   BdrReplicationSetConfigRelid;
 
-BdrConnectionConfig  **bdr_connection_configs;
-/* All databases for which BDR is configured, valid after _PG_init */
-char **bdr_distinct_dbnames;
-uint32 bdr_distinct_dbnames_count = 0;
-
 /* GUC storage */
 static char *connections = NULL;
 static bool bdr_synchronous_commit;
 int bdr_default_apply_delay;
 int bdr_max_workers;
+int bdr_max_databases;
 static bool bdr_skip_ddl_replication;
 bool bdr_skip_ddl_locking;
 bool bdr_do_not_replicate;
@@ -101,11 +96,17 @@ BdrWorkerType bdr_worker_type = BDR_WORKER_EMPTY_SLOT;
 /* shortcut for finding the the worker shmem block */
 BdrWorkerControl *BdrWorkerCtl = NULL;
 
+/* This worker's block within BdrWorkerCtl - only valid in bdr workers */
+BdrWorker  *bdr_worker_slot = NULL;
+
+/* Worker generation number; see bdr_worker_shmem_startup comments */
+static uint16 bdr_worker_generation;
+
+
 PG_MODULE_MAGIC;
 
 void       _PG_init(void);
 static void bdr_worker_shmem_startup(void);
-static void bdr_worker_shmem_create_workers(void);
 
 PGDLLEXPORT Datum bdr_apply_pause(PG_FUNCTION_ARGS);
 PGDLLEXPORT Datum bdr_apply_resume(PG_FUNCTION_ARGS);
@@ -123,7 +124,7 @@ PG_FUNCTION_INFO_V1(bdr_min_remote_version_num);
 PG_FUNCTION_INFO_V1(bdr_variant);
 PG_FUNCTION_INFO_V1(bdr_get_local_nodeid);
 
-static void
+void
 bdr_sigterm(SIGNAL_ARGS)
 {
    int         save_errno = errno;
@@ -144,7 +145,7 @@ bdr_sigterm(SIGNAL_ARGS)
    errno = save_errno;
 }
 
-static void
+void
 bdr_sighup(SIGNAL_ARGS)
 {
    int         save_errno = errno;
@@ -212,7 +213,7 @@ bdr_get_remote_dboid(const char *conninfo_db)
  *
  * The replication identifier is allocated in the current memory context.
  */
-void
+static void
 bdr_build_ident_and_slotname(uint64 remote_sysid, TimeLineID remote_tlid,
        Oid remote_dboid, char **out_replication_identifier,
        Name out_slot_name)
@@ -438,7 +439,7 @@ bdr_worker_init(char *dbname)
    /* make sure BDR extension exists */
    bdr_executor_always_allow_writes(true);
    StartTransactionCommand();
-   bdr_maintain_schema();
+   bdr_maintain_schema(true);
    CommitTransactionCommand();
    bdr_executor_always_allow_writes(false);
 
@@ -570,215 +571,6 @@ bdr_establish_connection_and_slot(const char *dsn,
    return streamConn;
 }
 
-/*
- * In postmaster, at shared_preload_libaries time, create the GUCs for a
- * connection. They'll be accessed by the apply worker that uses these GUCs
- * later.
- *
- * Returns false if the config wasn't created for some reason (missing
- * required options, etc); true if it's ok. Out parameters are not changed if
- * false is returned.
- *
- * Params:
- *
- *  name
- *  Name of this conn - bdr.<name>
- *
- *  used_databases
- *  Array of char*, names of distinct databases named in configured conns
- *
- *  num_used_databases
- *  Number of distinct databases named in conns
- *
- * out_config
- *  Assigned a palloc'd pointer to GUC storage for this config'd connection
- *
- * out_config is set even if false is returned, as the GUCs have still been
- * created. Test out_config->is_valid to see whether the connection is usable.
- */
-static bool
-bdr_create_con_gucs(char  *name,
-                   char **used_databases,
-                   Size  *num_used_databases,
-                   char **database_initcons,
-                   BdrConnectionConfig **out_config)
-{
-   Size        off;
-   char       *errormsg = NULL;
-   PQconninfoOption *options;
-   PQconninfoOption *cur_option;
-   BdrConnectionConfig *opts;
-
-   /* don't free, referenced by the guc machinery! */
-   char       *optname_dsn = palloc(strlen(name) + 30);
-   char       *optname_delay = palloc(strlen(name) + 30);
-   char       *optname_replica = palloc(strlen(name) + 30);
-   char       *optname_local_dsn = palloc(strlen(name) + 30);
-   char       *optname_local_dbname = palloc(strlen(name) + 30);
-   char       *optname_replication_sets = palloc(strlen(name) + 30);
-
-   Assert(process_shared_preload_libraries_in_progress);
-
-   /* Ensure the connection name is legal */
-   if (strchr(name, '_') != NULL)
-   {
-       ereport(ERROR,
-               (errmsg("bdr.connections entry '%s' contains the '_' character, which is not permitted", name)));
-   }
-
-   /* allocate storage for connection parameters */
-   opts = palloc0(sizeof(BdrConnectionConfig));
-   opts->is_valid = false;
-   *out_config = opts;
-
-   opts->name = pstrdup(name);
-
-   /* Define GUCs for this connection */
-   sprintf(optname_dsn, "bdr.%s_dsn", name);
-   DefineCustomStringVariable(optname_dsn,
-                              optname_dsn,
-                              NULL,
-                              &opts->dsn,
-                              NULL, PGC_POSTMASTER,
-                              GUC_NOT_IN_SAMPLE,
-                              NULL, NULL, NULL);
-
-   sprintf(optname_delay, "bdr.%s_apply_delay", name);
-   DefineCustomIntVariable(optname_delay,
-                           optname_delay,
-                           NULL,
-                           &opts->apply_delay,
-                           -1, -1, INT_MAX,
-                           PGC_SIGHUP,
-                           GUC_UNIT_MS,
-                           NULL, NULL, NULL);
-
-   sprintf(optname_replica, "bdr.%s_init_replica", name);
-   DefineCustomBoolVariable(optname_replica,
-                            optname_replica,
-                            NULL,
-                            &opts->init_replica,
-                            false,
-                            PGC_SIGHUP,
-                            0,
-                            NULL, NULL, NULL);
-
-   sprintf(optname_local_dsn, "bdr.%s_replica_local_dsn", name);
-   DefineCustomStringVariable(optname_local_dsn,
-                              optname_local_dsn,
-                              NULL,
-                              &opts->replica_local_dsn,
-                              NULL, PGC_POSTMASTER,
-                              GUC_NOT_IN_SAMPLE,
-                              NULL, NULL, NULL);
-
-   sprintf(optname_local_dbname, "bdr.%s_local_dbname", name);
-   DefineCustomStringVariable(optname_local_dbname,
-                              optname_local_dbname,
-                              NULL,
-                              &opts->dbname,
-                              NULL, PGC_POSTMASTER,
-                              GUC_NOT_IN_SAMPLE,
-                              NULL, NULL, NULL);
-
-   sprintf(optname_replication_sets, "bdr.%s_replication_sets", name);
-   DefineCustomStringVariable(optname_replication_sets,
-                              optname_replication_sets,
-                              NULL,
-                              &opts->replication_sets,
-                              NULL, PGC_POSTMASTER,
-                              GUC_LIST_INPUT | GUC_LIST_QUOTE,
-                              NULL, NULL, NULL);
-
-
-   if (!opts->dsn)
-   {
-       elog(WARNING, "bdr %s: no connection information", name);
-       return false;
-   }
-
-   elog(DEBUG2, "bdr %s: dsn=%s", name, opts->dsn);
-
-   options = PQconninfoParse(opts->dsn, &errormsg);
-   if (errormsg != NULL)
-   {
-       char       *str = pstrdup(errormsg);
-
-       PQfreemem(errormsg);
-       ereport(ERROR,
-               (errcode(ERRCODE_CONFIG_FILE_ERROR),
-                errmsg("bdr %s: error in dsn: %s", name, str)));
-   }
-
-   if (opts->dbname == NULL)
-   {
-       cur_option = options;
-       while (cur_option->keyword != NULL)
-       {
-           if (strcmp(cur_option->keyword, "dbname") == 0)
-           {
-               if (cur_option->val == NULL)
-                   ereport(ERROR,
-                           (errcode(ERRCODE_CONFIG_FILE_ERROR),
-                            errmsg("bdr %s: no dbname set", name)));
-
-               opts->dbname = pstrdup(cur_option->val);
-               elog(DEBUG2, "bdr %s: dbname=%s", name, opts->dbname);
-           }
-
-           if (cur_option->val != NULL)
-           {
-               elog(DEBUG3, "bdr %s: opt %s, val: %s",
-                    name, cur_option->keyword, cur_option->val);
-           }
-           cur_option++;
-       }
-   }
-
-   /* cleanup */
-   PQconninfoFree(options);
-
-   /*
-    * If this is a DB name we haven't seen yet, add it to our set of known
-    * DBs.
-    */
-   for (off = 0; off < *num_used_databases; off++)
-   {
-       if (strcmp(opts->dbname, used_databases[off]) == 0)
-           break;
-   }
-
-   if (off == *num_used_databases)
-   {
-       /* Didn't find a match, add new db name */
-       used_databases[(*num_used_databases)++] =
-           pstrdup(opts->dbname);
-       elog(DEBUG2, "bdr %s: Saw new database %s, now %i known dbs",
-            name, opts->dbname, (int)(*num_used_databases));
-   }
-
-   /*
-    * Make sure that at most one of the worker configs for each DB can be
-    * configured to run initialization.
-    */
-   if (opts->init_replica)
-   {
-       elog(DEBUG2, "bdr %s: has init_replica=t", name);
-       if (database_initcons[off] != NULL)
-           ereport(ERROR,
-                   (errcode(ERRCODE_CONFIG_FILE_ERROR),
-                    errmsg("Connections %s and %s on database %s both have bdr_init_replica enabled, cannot continue",
-                           name, database_initcons[off], used_databases[off])));
-       else
-           database_initcons[off] = name; /* no need to pstrdup, see _PG_init */
-   }
-
-   opts->is_valid = true;
-
-   /* optname vars intentionally leaked, see above */
-   return true;
-}
-
 static size_t
 bdr_worker_shmem_size()
 {
@@ -846,16 +638,35 @@ bdr_worker_shmem_startup(void)
        /* Init shm segment header after postmaster start or restart */
        memset(BdrWorkerCtl, 0, bdr_worker_shmem_size());
        BdrWorkerCtl->lock = LWLockAssign();
+       /* Assigned on supervisor launch */
+       BdrWorkerCtl->supervisor_latch = NULL;
 
        /*
-        * Now that the shm segment is initialized, we can populate it with
-        * BdrWorker entries for the connections we created GUCs for during
-        * _PG_init.
+        * The postmaster keeps track of a generation number for BDR workers
+        * and increments it at each restart.
+        *
+        * Background workers aren't unregistered when the postmaster restarts
+        * and clears shared memory, so after a restart the supervisor and
+        * per-db workers have no idea what workers are/aren't running, nor any
+        * way to control them. To make a clean BDR restart possible the
+        * workers registered before the restart need to find out about the
+        * restart and terminate.
+        *
+        * To make that possible we pass the generation number to the worker
+        * in its main argument, and also set it in shared memory. The two
+        * must match. If they don't, the worker will proc_exit(0), causing
+        * itself to be unregistered.
         *
-        * We must do this whether it's initial launch or a postmaster restart,
-        * as shmem gets cleared on postmaster restart.
+        * This should really be part of the bgworker API itself, handled via
+        * a BGW_NO_RESTART_ON_CRASH flag or by providing a generation number
+        * as a bgworker argument. However, for now we're stuck with this
+        * workaround.
         */
-       bdr_worker_shmem_create_workers();
+       if (bdr_worker_generation == UINT16_MAX)
+           /* We could handle wrap-around, but really ... */
+           elog(FATAL, "Too many postmaster crash/restart cycles. Restart the PostgreSQL server.");
+
+       BdrWorkerCtl->worker_generation = ++bdr_worker_generation;
    }
    LWLockRelease(AddinShmemInitLock);
 
@@ -865,145 +676,6 @@ bdr_worker_shmem_startup(void)
     */
 }
 
-/*
- * After _PG_init we've read the GUCs for the workers but haven't populated the
- * shared memory segment at BdrWorkerCtl with BDRWorker entries yet.
- *
- * The shm segment is initialized now, so do that.
- */
-static void
-bdr_worker_shmem_create_workers(void)
-{
-   uint32 off;
-
-   /*
-    * Create a BdrPerdbWorker for each distinct database found during
-    * _PG_init. The bgworker for each has already been registered and assigned
-    * a slot position during _PG_init, but the slot doesn't have anything
-    * useful in it yet. Because it was already registered we don't need
-    * any protection against duplicate launches on restart here.
-    *
-    * Because these slots are pre-assigned before shmem is bought up they
-    * MUST be reserved first, before any shmem entries are allocated, so
-    * they get the first slots.
-    *
-    * When started, this worker will continue setup - doing any required
-    * initialization of the database, then registering dynamic bgworkers for
-    * the DB's individual BDR connections.
-    *
-    * If we ever want to support dynamically adding/removing DBs from BDR at
-    * runtime, this'll need to move into a static bgworker because dynamic
-    * bgworkers can't be launched directly from the postmaster. We'll need a
-    * "bdr manager" static bgworker.
-    */
-
-   for (off = 0; off < bdr_distinct_dbnames_count; off++)
-   {
-       BdrWorker      *shmworker;
-       BdrPerdbWorker *perdb;
-       uint32      ctl_idx;
-
-       shmworker = (BdrWorker *) bdr_worker_shmem_alloc(BDR_WORKER_PERDB, &ctl_idx);
-       Assert(shmworker->worker_type == BDR_WORKER_PERDB);
-       /*
-        * The workers have already been assigned shmem indexes during
-        * _PG_init, so they MUST get the same index here. So long as these
-        * entries are assigned before any other shmem slots they will.
-        */
-       Assert(ctl_idx == off);
-       perdb = &shmworker->data.perdb;
-
-       strncpy(NameStr(perdb->dbname), bdr_distinct_dbnames[off], NAMEDATALEN);
-       NameStr(perdb->dbname)[NAMEDATALEN-1] = '\0';
-
-       perdb->nnodes = 0;
-       perdb->seq_slot = off;
-
-       elog(DEBUG1, "Assigning shmem bdr database worker for db %s",
-            NameStr(perdb->dbname));
-   }
-
-   /*
-    * Populate shmem with a BdrApplyWorker for each valid BdrConnectionConfig
-    * found during _PG_init so that the per-db worker will register it for
-    * startup after performing any BDR initialisation work.
-    *
-    * Use of shared memory for this is required for EXEC_BACKEND (windows)
-    * where we can't share postmaster memory, and for when we're launching a
-    * bgworker from another bgworker where the fork() from postmaster doesn't
-    * provide access to the launching bgworker's memory.
-    *
-    * The workers aren't actually launched here, they get launched by
-    * launch_apply_workers(), called by the database's per-db static worker.
-    */
-   for (off = 0; off < bdr_max_workers; off++)
-   {
-       BdrConnectionConfig *cfg = bdr_connection_configs[off];
-       BdrWorker      *shmworker;
-       BdrApplyWorker *worker;
-       int             i;
-       bool            found_perdb = false;
-
-       if (cfg == NULL || !cfg->is_valid)
-           continue;
-
-       shmworker = (BdrWorker *) bdr_worker_shmem_alloc(BDR_WORKER_APPLY, NULL);
-       Assert(shmworker->worker_type == BDR_WORKER_APPLY);
-       worker = &shmworker->data.apply;
-       worker->connection_config_idx = off;
-       worker->replay_stop_lsn = InvalidXLogRecPtr;
-       worker->forward_changesets = false;
-
-       /*
-        * Now search for the perdb worker belonging to this slot.
-        */
-       for (i = 0; i < bdr_max_workers; i++)
-       {
-           BdrPerdbWorker *perdb;
-           BdrWorker *entry = &BdrWorkerCtl->slots[i];
-
-           if (entry->worker_type != BDR_WORKER_PERDB)
-               continue;
-
-           perdb = &entry->data.perdb;
-
-           if (strcmp(NameStr(perdb->dbname), cfg->dbname) != 0)
-               continue;
-
-           /*
-            * Remember how many connections there are for this node. This
-            * will, e.g., be used to determine the quorum for ddl locks and
-            * sequencer votes.
-            */
-           perdb->nnodes++;
-           found_perdb = true;
-           worker->perdb_worker_off = i;
-           break;
-       }
-
-       if (!found_perdb)
-           elog(ERROR, "couldn't find perdb entry for apply worker");
-
-       /*
-        * If this is a postmaster restart, don't register the worker a second
-        * time when the per-db worker starts up.
-        */
-       worker->bgw_is_registered = bdr_is_restart;
-   }
-
-   /*
-    * Make sure that we don't register workers if the postmaster restarts and
-    * clears shmem, by keeping a record that we've asked for registration once
-    * already.
-    */
-   bdr_is_restart = true;
-
-   /*
-    * We might need to re-populate shared memory after a postmaster restart.
-    * So we don't free the bdr_startup_context or its contents.
-    */
-}
-
 
 /*
  * Allocate a block from the bdr_worker shm segment in BdrWorkerCtl, or ERROR
@@ -1014,12 +686,16 @@ bdr_worker_shmem_create_workers(void)
  * ctl_idx, if passed, is set to the index of the worker within BdrWorkerCtl.
  *
  * To release a block, use bdr_worker_shmem_release(...)
+ *
+ * You must hold BdrWorkerCtl->lock in LW_EXCLUSIVE mode for
+ * this call.
  */
 BdrWorker*
 bdr_worker_shmem_alloc(BdrWorkerType worker_type, uint32 *ctl_idx)
 {
    int i;
-   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
    for (i = 0; i < bdr_max_workers; i++)
    {
        BdrWorker *new_entry = &BdrWorkerCtl->slots[i];
@@ -1027,13 +703,11 @@ bdr_worker_shmem_alloc(BdrWorkerType worker_type, uint32 *ctl_idx)
        {
            memset(new_entry, 0, sizeof(BdrWorker));
            new_entry->worker_type = worker_type;
-           LWLockRelease(BdrWorkerCtl->lock);
            if (ctl_idx)
                *ctl_idx = i;
            return new_entry;
        }
    }
-   LWLockRelease(BdrWorkerCtl->lock);
    ereport(ERROR,
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
            errmsg("No free bdr worker slots - bdr.max_workers is too low")));
@@ -1127,17 +801,7 @@ bdr_do_not_replicate_assign_hook(bool newvalue, void *extra)
 void
 _PG_init(void)
 {
-   List       *connames;
-   ListCell   *c;
    MemoryContext old_context;
-   char       *connections_tmp;
-
-   char      **used_databases;
-   char      **database_initcons;
-   Size        num_used_databases = 0;
-   int         connection_config_idx;
-   BackgroundWorker bgw;
-   uint32      off;
 
    if (!process_shared_preload_libraries_in_progress)
        ereport(ERROR,
@@ -1151,6 +815,15 @@ _PG_init(void)
                 errmsg("bdr requires \"track_commit_timestamp\" to be enabled")));
 #endif
 
+   /*
+    * _PG_init only runs on first load, not on postmaster restart, so
+    * set the worker generation here. See bdr_worker_shmem_startup.
+    *
+    * It starts at 1 because the postmaster zeroes shmem on restart, so 0 can
+    * mean "just restarted, hasn't run shmem setup callback yet".
+    */
+   bdr_worker_generation = 1;
+
    /*
     * Force btree_gist to be loaded - its absolutely not required at this
     * point, but since it's required for BDR to be used it's much easier to
@@ -1213,10 +886,19 @@ _PG_init(void)
     * memory array.
     */
    DefineCustomIntVariable("bdr.max_workers",
-                           "max number of bdr connections + distinct databases. -1 auto-calculates.",
+                           "max number of bdr connections + distinct databases.",
                            NULL,
                            &bdr_max_workers,
-                           -1, -1, 100,
+                           20, 2, 100,
+                           PGC_POSTMASTER,
+                           0,
+                           NULL, NULL, NULL);
+
+   DefineCustomIntVariable("bdr.max_databases",
+                           "max number of distinct databases on which BDR may be active",
+                           NULL,
+                           &bdr_max_databases,
+                           -1, -1, 50,
                            PGC_POSTMASTER,
                            0,
                            NULL, NULL, NULL);
@@ -1271,15 +953,6 @@ _PG_init(void)
                               0,
                               NULL, NULL, NULL);
 
-   DefineCustomBoolVariable("bdr.init_from_basedump",
-                            "Internal. Set during local initialization from basebackup only",
-                            NULL,
-                            &bdr_init_from_basedump,
-                            false,
-                            PGC_BACKEND,
-                            0,
-                            NULL, NULL, NULL);
-
    DefineCustomBoolVariable("bdr.do_not_replicate",
                             "Internal. Set during local initialization from basebackup only",
                             NULL,
@@ -1293,40 +966,7 @@ _PG_init(void)
 
    bdr_label_init();
 
-   /* if nothing is configured, we're done */
-   if (connections == NULL)
-   {
-       /* If worker count autoconfigured, use zero */
-       if (bdr_max_workers == -1)
-           bdr_max_workers = 0;
-       goto out;
-   }
-
-   /* Copy 'connections' guc so SplitIdentifierString can modify it in-place */
-   connections_tmp = pstrdup(connections);
-
-   /* Get the list of BDR connection names to iterate over. */
-   if (!SplitIdentifierString(connections_tmp, ',', &connames))
-   {
-       /* syntax error in list */
-       ereport(FATAL,
-               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                errmsg("invalid list syntax for \"bdr.connections\"")));
-   }
-
-   /*
-    * If bdr.max_connections is -1, the default, auto-set it with the
-    * most workers we might need with the current number of connections
-    * configured. Per-db workers are due to use shmem too, so we might
-    * have up to one per-db worker for each configured connection if
-    * each is on a different DB.
-    */
-   if (bdr_max_workers == -1)
-   {
-       bdr_max_workers = list_length(connames) * 3;
-       elog(DEBUG1, "bdr: bdr_max_workers unset, configuring for %d workers",
-               bdr_max_workers);
-   }
+   bdr_supervisor_register();
 
    /*
     * Sanity check max_worker_processes to make sure it's at least big enough
@@ -1342,6 +982,17 @@ _PG_init(void)
                 errhint("Set max_worker_processes to at least %d", bdr_max_workers)));
    }
 
+   /*
+    * If bdr.max_databases is not explicitly specified, assume the worst case
+    * of many DBs with one connection per DB.
+    */
+   if (bdr_max_databases == -1)
+   {
+       bdr_max_databases = bdr_max_workers / 2;
+       elog(DEBUG1, "Autoconfiguring bdr.max_databases to %d (bdr.max_workers/2)",
+            bdr_max_databases);
+   }
+
    /*
     * Allocate a shared memory segment to store the bgworker connection
     * information we must pass to each worker we launch.
@@ -1352,110 +1003,19 @@ _PG_init(void)
     */
    bdr_worker_alloc_shmem_segment();
 
-   /* Allocate space for BDR connection GUCs */
-   bdr_connection_configs = (BdrConnectionConfig**)
-       palloc0(bdr_max_workers * sizeof(BdrConnectionConfig*));
-
-   /* Names of all databases we're going to be doing BDR for */
-   used_databases = palloc0(sizeof(char *) * list_length(connames));
-   /*
-    * For each db named in used_databases, the corresponding index is the name
-    * of the conn with bdr_init_replica=t if any.
-    */
-   database_initcons = palloc0(sizeof(char *) * list_length(connames));
-
-   /*
-    * Read all connections, create/validate parameters for them and do sanity
-    * checks as we go.
-    */
-   connection_config_idx = 0;
-   foreach(c, connames)
-   {
-       char           *name;
-       name = (char *) lfirst(c);
-
-       if (!bdr_create_con_gucs(name, used_databases, &num_used_databases,
-                                database_initcons,
-                                &bdr_connection_configs[connection_config_idx]))
-           continue;
-
-       Assert(bdr_connection_configs[connection_config_idx] != NULL);
-       connection_config_idx++;
-   }
-
-   /*
-    * Free the connames list cells. The strings are just pointers into
-    * 'connections' and must not be freed'd.
-    */
-   list_free(connames);
-   connames = NIL;
-
-   /*
-    * We've ensured there are no duplicate init connections, no need to
-    * remember which conn is the bdr_init_replica conn anymore. The contents
-    * are just pointers into connections_tmp so we don't want to free them.
-    */
-   pfree(database_initcons);
-
-   /*
-    * Copy the list of used databases into a global where we can
-    * use it for registering the per-database workers during shmem init.
-    */
-   bdr_distinct_dbnames = palloc(sizeof(char*)*num_used_databases);
-   memcpy(bdr_distinct_dbnames, used_databases,
-          sizeof(char*)*num_used_databases);
-   bdr_distinct_dbnames_count = num_used_databases;
-   pfree(used_databases);
-   num_used_databases = 0;
-   used_databases = NULL;
-
-   /*
-    * Register the per-db workers and assign them an index in shmem. The
-    * memory doesn't actually exist yet, it'll be allocated in shmem init.
-    *
-    * No protection against multiple launches is requried because this
-    * only runs once, in _PG_init.
-    */
-   bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
-       BGWORKER_BACKEND_DATABASE_CONNECTION;
-   bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-   bgw.bgw_main = NULL;
-   strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
-   strncpy(bgw.bgw_function_name, "bdr_perdb_worker_main", BGW_MAXLEN);
-   bgw.bgw_restart_time = 5;
-   bgw.bgw_notify_pid = 0;
-   for (off = 0; off < bdr_distinct_dbnames_count; off++)
-   {
-       snprintf(bgw.bgw_name, BGW_MAXLEN,
-                "bdr: %s", bdr_distinct_dbnames[off]);
-       /*
-        * This index into BdrWorkerCtl shmem hasn't been populated yet. It'll
-        * be set up in bdr_worker_shmem_create_workers .
-        */
-       bgw.bgw_main_arg = Int32GetDatum(off);
-       RegisterBackgroundWorker(&bgw);
-   }
-
    EmitWarningsOnPlaceholders("bdr");
 
-   pfree(connections_tmp);
-
-out:
-
    /*
     * initialize other modules that need shared memory
-    *
-    * Do so even if we haven't any remote nodes setup, the shared memory might
-    * still be needed for some sql callable functions or such.
     */
 
    /* register a slot for every remote node */
    bdr_count_shmem_init(bdr_max_workers);
    bdr_executor_init();
 #ifdef BUILDING_BDR
-   bdr_sequencer_shmem_init(bdr_max_workers, bdr_distinct_dbnames_count);
+   bdr_sequencer_shmem_init(bdr_max_workers, bdr_max_databases);
 #endif
-   bdr_locks_shmem_init(bdr_distinct_dbnames_count);
+   bdr_locks_shmem_init();
    /* Set up a ProcessUtility_hook to stop unsupported commands being run */
    init_bdr_commandfilter();
 
@@ -1483,9 +1043,12 @@ bdr_lookup_relid(const char *relname, Oid schema_oid)
  * Concurrent executions will block, but not fail.
  *
  * Must be called inside transaction.
+ *
+ * If update_extensions is true, ALTER EXTENSION commands will be issued to
+ * ensure the required extension(s) are at the current version.
  */
 void
-bdr_maintain_schema(void)
+bdr_maintain_schema(bool update_extensions)
 {
    Relation    extrel;
    Oid         btree_gist_oid;
@@ -1504,17 +1067,13 @@ bdr_maintain_schema(void)
    btree_gist_oid = get_extension_oid("btree_gist", true);
    bdr_oid = get_extension_oid("bdr", true);
 
-   /* create required extension if they don't exists yet */
    if (btree_gist_oid == InvalidOid)
-   {
-       CreateExtensionStmt create_stmt;
+       elog(ERROR, "btree_gist is required by BDR but not installed in the current database");
 
-       create_stmt.if_not_exists = false;
-       create_stmt.options = NIL;
-       create_stmt.extname = (char *)"btree_gist";
-       CreateExtension(&create_stmt);
-   }
-   else
+   if (bdr_oid == InvalidOid)
+       elog(ERROR, "bdr extension is not installed in the current database");
+
+   if (update_extensions)
    {
        AlterExtensionStmt alter_stmt;
 
@@ -1522,20 +1081,6 @@ bdr_maintain_schema(void)
        alter_stmt.options = NIL;
        alter_stmt.extname = (char *)"btree_gist";
        ExecAlterExtensionStmt(&alter_stmt);
-   }
-
-   if (bdr_oid == InvalidOid)
-   {
-       CreateExtensionStmt create_stmt;
-
-       create_stmt.if_not_exists = false;
-       create_stmt.options = NIL;
-       create_stmt.extname = (char *)"bdr";
-       CreateExtension(&create_stmt);
-   }
-   else
-   {
-       AlterExtensionStmt alter_stmt;
 
        /* TODO: only do this if necessary */
        alter_stmt.options = NIL;
index 8673e3976b2f0ea55967216fb26c8769b419e1ed..b3faa319e2cc5dafa75683be3ff66bff7954b758 100644 (file)
@@ -1,6 +1,6 @@
 # bdr extension
 comment = 'Bi-directional replication for PostgreSQL'
-default_version = '0.9.0.0'
+default_version = '0.9.0.1'
 module_pathname = '$libdir/bdr'
 relocatable = false
 requires = btree_gist
diff --git a/bdr.h b/bdr.h
index 566a77e7f48075905173edf9a930f834d642aeb0..12440f2ff855963e728558fb7bf8ba4073f4a899 100644 (file)
--- a/bdr.h
+++ b/bdr.h
@@ -14,6 +14,7 @@
 #include "postmaster/bgworker.h"
 #include "replication/logical.h"
 #include "utils/resowner.h"
+#include "storage/latch.h"
 #include "storage/lock.h"
 
 #include "libpq-fe.h"
@@ -154,11 +155,16 @@ typedef struct BDRTupleData
  */
 typedef struct BdrApplyWorker
 {
+   /* oid of the database this worker is applying changes to */
+   Oid dboid;
+
    /*
-    * Index in bdr_connection_configs of this workers's GUCs
-    * and config info (including dbname, name, etc).
+    * Identification for the remote db we're connecting to; used to
+    * find the appropriate bdr.bdr_connections row, etc.
     */
-   int connection_config_idx;
+   uint64      remote_sysid;
+   TimeLineID  remote_timeline;
+   Oid         remote_dboid;
 
    /*
     * If not InvalidXLogRecPtr, stop replay at this point and exit.
@@ -170,15 +176,6 @@ typedef struct BdrApplyWorker
 
    /* Request that the remote forward all changes from other nodes */
    bool forward_changesets;
-
-   /*
-    * Ensure this worker doesn't get registered a second time if there's a
-    * perdb worker restart or postmaster restart. Ideally we'd store the
-    * BackgroundWorkerHandle, but it's an opaque struct.
-    */
-   bool bgw_is_registered;
-
-   size_t perdb_worker_off;
 } BdrApplyWorker;
 
 /*
@@ -187,18 +184,26 @@ typedef struct BdrApplyWorker
  */
 typedef struct BdrPerdbWorker
 {
-   /* local database name */
+   /* local database name to connect to */
    NameData dbname;
 
    /* number of outgoing connections from this database */
-   size_t nnodes;
+   Size nnodes;
 
    size_t seq_slot;
 
+   /* The perdb worker's latch from the PROC array, for use from other backends */
+   Latch      *proclatch;
+
+   /* Oid of the database the worker is attached to - populated after start */
+   Oid database_oid;
 } BdrPerdbWorker;
 
 /*
  * Type of BDR worker in a BdrWorker struct
+ *
+ * Note that the supervisor worker doesn't appear here, it has its own
+ * dedicated entry in the shmem segment.
  */
 typedef enum {
    /*
@@ -206,7 +211,7 @@ typedef enum {
     * it's set by memset(...) during shm segment init.
     */
    BDR_WORKER_EMPTY_SLOT = 0,
-   /* This shm array slot contains data for a */
+   /* This shm array slot contains data for a BdrApplyWorker */
    BDR_WORKER_APPLY,
    /* This is data for a per-database worker BdrPerdbWorker */
    BDR_WORKER_PERDB,
@@ -235,18 +240,11 @@ typedef struct BdrWorker
 
 } BdrWorker;
 
-/*
- * Params for every connection in bdr.connections.
- *
- * Contains n=bdr_max_workers elements, may have NULL entries.
- */
-extern BdrConnectionConfig **bdr_connection_configs;
-
 /* GUCs */
 extern int bdr_default_apply_delay;
 extern int bdr_max_workers;
+extern int bdr_max_databases;
 extern char *bdr_temp_dump_directory;
-extern bool bdr_init_from_basedump;
 extern bool bdr_log_conflicts_to_table;
 extern bool bdr_conflict_logging_include_tuples;
 extern bool bdr_permit_unsafe_commands;
 typedef struct BdrWorkerControl
 {
    /* Must hold this lock when writing to BdrWorkerControl members */
    LWLockId     lock;
+   /* Worker generation number, incremented on postmaster restart; apply workers launched for another generation exit at startup */
+   uint16       worker_generation;
    /* Set/unset by bdr_apply_pause()/_replay(). */
    bool         pause_apply;
+   /* Supervisor restart flag. NOTE(review): the name suggests true means "not the first startup" - confirm against the supervisor code */
+   bool         is_supervisor_restart;
+   /* Latch for the supervisor worker */
+   Latch       *supervisor_latch;
    /* Array members, of size bdr_max_workers */
    BdrWorker    slots[FLEXIBLE_ARRAY_MEMBER];
 } BdrWorkerControl;
 
 extern BdrWorkerControl *BdrWorkerCtl;
+extern BdrWorker       *bdr_worker_slot;
 
 extern ResourceOwner bdr_saved_resowner;
 
@@ -294,8 +299,25 @@ extern Oid BdrLocksByOwnerRelid;
 
 extern Oid  BdrReplicationSetConfigRelid;
 
+/* In-memory copy of one bdr.bdr_nodes record; release with bdr_bdr_node_free() */
+typedef struct BDRNodeInfo
+{
+   /* Node identity: (system identifier, timeline, database oid) */
+   uint64      sysid;
+   TimeLineID  timeline;
+   Oid         dboid;
+
+   char        status;         /* value of the node_status column */
+
+   char       *local_dsn;      /* node_local_dsn column as text; may be NULL */
+   char       *init_from_dsn;  /* node_init_from_dsn column as text; may be NULL */
+} BDRNodeInfo;
+
 extern Oid bdr_lookup_relid(const char *relname, Oid schema_oid);
 
+extern void bdr_sequencer_set_nnodes(Size nnodes);
+
+
 /* apply support */
 extern void bdr_fetch_sysid_via_node_id(RepNodeId node_id, uint64 *sysid,
                                        TimeLineID *tli, Oid *remote_dboid);
@@ -385,8 +407,11 @@ PGDLLEXPORT extern Datum bdr_sequence_setval(PG_FUNCTION_ARGS);
 PGDLLEXPORT extern Datum bdr_sequence_options(PG_FUNCTION_ARGS);
 #endif
 
+extern int bdr_sequencer_get_next_free_slot(void); //XXX PERDB temp
+
+
 /* statistic functions */
-extern void bdr_count_shmem_init(size_t nnodes);
+extern void bdr_count_shmem_init(Size nnodes);
 extern void bdr_count_set_current_node(RepNodeId node_id);
 extern void bdr_count_commit(void);
 extern void bdr_count_rollback(void);
@@ -405,10 +430,10 @@ extern bool bdr_get_integer_timestamps(void);
 extern bool bdr_get_bigendian(void);
 
 /* initialize a new bdr member */
-extern void bdr_init_replica(Name dbname);
+extern void bdr_init_replica(BDRNodeInfo *local_node);
 
 /* shared memory management */
-extern void bdr_maintain_schema(void);
+extern void bdr_maintain_schema(bool update_extensions);
 extern BdrWorker* bdr_worker_shmem_alloc(BdrWorkerType worker_type,
                                         uint32 *ctl_idx);
 extern void bdr_worker_shmem_release(BdrWorker* worker, BackgroundWorkerHandle *handle);
@@ -423,20 +448,35 @@ extern void bdr_executor_always_allow_writes(bool always_allow);
 extern void bdr_queue_ddl_command(char *command_tag, char *command);
 extern void bdr_execute_ddl_command(char *cmdstr, char *perpetrator, bool tx_just_started);
 
-extern void bdr_locks_shmem_init(Size num_used_databases);
+extern void bdr_locks_shmem_init(void);
 extern void bdr_locks_check_query(void);
 
-/* background workers */
-extern void bdr_worker_init(char* dbname);
+/* background workers and supporting functions for them */
 PGDLLEXPORT extern void bdr_apply_main(Datum main_arg);
 PGDLLEXPORT extern void bdr_perdb_worker_main(Datum main_arg);
+PGDLLEXPORT extern void bdr_supervisor_worker_main(Datum main_arg);
+
+extern void bdr_worker_init(char* dbname);
+extern void bdr_supervisor_register(void);
+
+extern void bdr_sighup(SIGNAL_ARGS);
+extern void bdr_sigterm(SIGNAL_ARGS);
+
+extern int find_perdb_worker_slot(Oid dboid,
+                                    BdrWorker **worker_found);
+
+extern void bdr_launch_apply_workers(Oid dboid);
 
 /* Information functions */
 extern int bdr_parse_version(const char * bdr_version_str, int *o_major,
                             int *o_minor, int *o_rev, int *o_subrev);
 
 /* manipulation of bdr catalogs */
-extern char bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli, Oid dboid);
+extern char bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli,
+                                      Oid dboid);
+extern BDRNodeInfo * bdr_nodes_get_local_info(uint64 sysid, TimeLineID tli,
+                                         Oid dboid);
+extern void bdr_bdr_node_free(BDRNodeInfo *node);
 extern void bdr_nodes_set_local_status(char status);
 
 extern Oid GetSysCacheOidError(int cacheId, Datum key1, Datum key2, Datum key3,
@@ -463,7 +503,8 @@ bdr_copytable(PGconn *copyfrom_conn, PGconn *copyto_conn,
 
 /* helpers shared by multiple worker types */
 extern struct pg_conn* bdr_connect(const char *conninfo, Name appname,
-                                  uint64* remote_sysid_i, TimeLineID *remote_tlid_i,
+                                  uint64* remote_sysid_i,
+                                  TimeLineID *remote_tlid_i,
                                   Oid *out_dboid_i);
 
 extern struct pg_conn *
@@ -474,11 +515,7 @@ bdr_establish_connection_and_slot(const char *dsn,
                                  TimeLineID *out_timeline,
                                  Oid *out_dboid,
                                  RepNodeId *out_replication_identifier,
-                                 char **out_snapshot);
-extern void
-bdr_build_ident_and_slotname(uint64 remote_sysid, TimeLineID remote_tlid,
-       Oid remote_dboid, char **out_replication_identifier,
-       Name out_slot_name);
+                                 char **out_snapshot);
 
 extern PGconn* bdr_connect_nonrepl(const char *connstring,
        const char *appnamesuffix);
index f901f488089253961d3466e4b7ba98ff71ffd7c4..a6fe91b214ab98619d219a948750f3d8ab86e29d 100644 (file)
@@ -35,6 +35,8 @@
 #include "catalog/namespace.h"
 #include "catalog/pg_type.h"
 
+#include "executor/spi.h"
+
 #include "libpq/pqformat.h"
 
 #include "mb/pg_wchar.h"
@@ -97,16 +99,10 @@ static RepNodeId        remote_origin_id = InvalidRepNodeId;
 /*
  * This code only runs within an apply bgworker, so we can stash a pointer to our
  * state in shm in a global for convenient access.
- *
- * TODO: make static once bdr_apply_main moved into bdr.c
  */
-BdrApplyWorker *bdr_apply_worker = NULL;
+static BdrApplyWorker *bdr_apply_worker = NULL;
 
-/*
- * GUCs for this apply worker - again, this is fixed for the lifetime of the
- * worker so we can stash it in a global.
- */
-BdrConnectionConfig *bdr_apply_config = NULL;
+static BdrConnectionConfig *bdr_apply_config = NULL;
 
 dlist_head bdr_lsn_association = DLIST_STATIC_INIT(bdr_lsn_association);
 
@@ -195,8 +191,7 @@ process_remote_begin(StringInfo s)
    replication_origin_xid = remote_xid;
 
    snprintf(statbuf, sizeof(statbuf),
-           "bdr_apply: BEGIN origin(source, orig_lsn, timestamp): %s, %X/%X, %s",
-            bdr_apply_config->name,
+           "bdr_apply: BEGIN origin(source, orig_lsn, timestamp): %X/%X, %s",
            (uint32) (origlsn >> 32), (uint32) origlsn,
            timestamptz_to_str(committime));
 
@@ -381,8 +376,7 @@ process_remote_commit(StringInfo s)
            && bdr_apply_worker->replay_stop_lsn <= end_lsn)
    {
        ereport(LOG,
-               (errmsg("bdr apply %s finished processing; replayed to %X/%X of required %X/%X",
-                bdr_apply_config->name,
+               (errmsg("bdr apply finished processing; replayed to %X/%X of required %X/%X",
                 (uint32)(end_lsn>>32), (uint32)end_lsn,
                 (uint32)(bdr_apply_worker->replay_stop_lsn>>32), (uint32)bdr_apply_worker->replay_stop_lsn)));
        /*
@@ -2389,6 +2383,7 @@ bdr_apply_work(PGconn* streamConn)
    }
 }
 
+
 /*
  * Entry point for a BDR apply worker.
  *
@@ -2405,27 +2400,70 @@ bdr_apply_main(Datum main_arg)
    RepNodeId   replication_identifier;
    XLogRecPtr  start_from;
    NameData    slot_name;
-   BdrWorker  *bdr_worker_slot;
+   NameData    dbname;
+   BdrWorker  *perdb;
+   uint32      worker_arg;
+   uint16      apply_worker_idx,
+               worker_generation;
+   int         perdb_worker_idx;
 
    Assert(IsBackgroundWorker);
 
+   worker_arg = DatumGetInt32(main_arg);
+
+   worker_generation = (uint16)(worker_arg >> 16);
+   apply_worker_idx = (uint16)(worker_arg & 0x0000FFFF);
+
+   if (worker_generation != BdrWorkerCtl->worker_generation)
+   {
+       elog(DEBUG1, "apply worker from generation %d exiting after finding shmem generation is %d",
+            worker_generation, BdrWorkerCtl->worker_generation);
+       proc_exit(0);
+   }
+
    initStringInfo(&query);
 
-   bdr_worker_slot = &BdrWorkerCtl->slots[ DatumGetInt32(main_arg) ];
+   bdr_worker_slot = &BdrWorkerCtl->slots[ apply_worker_idx ];
    Assert(bdr_worker_slot->worker_type == BDR_WORKER_APPLY);
    bdr_apply_worker = &bdr_worker_slot->data.apply;
    bdr_worker_type = BDR_WORKER_APPLY;
 
-   bdr_apply_config = bdr_connection_configs[bdr_apply_worker->connection_config_idx];
-   Assert(bdr_apply_config != NULL);
-
-   bdr_worker_init(bdr_apply_config->dbname);
+   /*
+    * Get the database name to connect to from the perdb worker for this db
+    *
+    * It'd be preferable to just connect by oid, but the bgworkers interface
+    * doesn't permit us to do that, and we can't look up the syscache to find
+    * the name by oid until we're connected.
+    */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
+   perdb_worker_idx = find_perdb_worker_slot(bdr_apply_worker->dboid, NULL);
+   Assert(perdb_worker_idx >= 0);
+   perdb = &BdrWorkerCtl->slots[perdb_worker_idx];
+   Assert(perdb->worker_type == BDR_WORKER_PERDB);
+   namecpy(&dbname, &perdb->data.perdb.dbname);
+   LWLockRelease(BdrWorkerCtl->lock);
+
+   /* Then unblock signals, connect to the db, etc */
+   bdr_worker_init(NameStr(dbname));
+
+   Assert(MyDatabaseId == bdr_apply_worker->dboid);
+
+   /* Read our connection configuration from the database */
+   bdr_apply_config = bdr_get_connection_config(
+       bdr_apply_worker->remote_sysid,
+       bdr_apply_worker->remote_timeline,
+       bdr_apply_worker->remote_dboid,
+       false);
+
+   Assert(bdr_apply_config->sysid == bdr_apply_worker->remote_sysid &&
+          bdr_apply_config->timeline == bdr_apply_worker->remote_timeline &&
+          bdr_apply_config->dboid == bdr_apply_worker->remote_dboid);
 
    CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr apply top-level resource owner");
    bdr_saved_resowner = CurrentResourceOwner;
 
    elog(DEBUG1, "%s initialized on %s",
-        MyBgworkerEntry->bgw_name, bdr_apply_config->dbname);
+        MyBgworkerEntry->bgw_name, NameStr(dbname));
 
    /* Set our local application_name for our SPI connections */
    resetStringInfo(&query);
@@ -2504,6 +2542,8 @@ bdr_apply_main(Datum main_arg)
    appendStringInfo(&query, ", db_encoding '%s'", GetDatabaseEncodingName());
    if (bdr_apply_worker->forward_changesets)
        appendStringInfo(&query, ", forward_changesets 't'");
+   if (bdr_apply_config->is_unidirectional)
+       appendStringInfo(&query, ", unidirectional 't'");
 
    appendStringInfoChar(&query, ')');
 
index 5091747fa6d2ad586e457d848542d8d756ba2185..fb4b85078ae4d31dc2bb0db0d603e31aea158549 100644 (file)
 
 #include "utils/builtins.h"
 #include "utils/guc.h"
+#include "utils/memutils.h"
 #include "utils/syscache.h"
 
+static int getattno(const char *colname);
+static char* bdr_textarr_to_identliststr(ArrayType *textarray);
+
+
 /* GetSysCacheOid equivalent that errors out if nothing is found */
 Oid
 GetSysCacheOidError(int cacheId,
@@ -70,7 +75,7 @@ bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli, Oid dboid)
    Oid         argtypes[] = { TEXTOID, OIDOID, OIDOID };
    Datum       values[3];
    bool        isnull;
-   char        status;
+   char        status;
    char        sysid_str[33];
    Oid         schema_oid;
 
@@ -118,17 +123,108 @@ bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli, Oid dboid)
 }
 
 /*
- * Insert a row for the local node's (sysid,tlid,dboid) with the passed status
- * into bdr.bdr_nodes. No existing row for this key may exist.
+ * Get the bdr.bdr_nodes record for the specified node from the local
+ * bdr.bdr_nodes table via SPI.
  *
- * Unlike bdr_set_remote_status, '\0' may not be passed to delete the row, and
- * no upsert is performed. This is a simple insert only.
+ * Returns a palloc'd BDRNodeInfo, or NULL if no such row exists.
  *
- * Unlike bdr_nodes_get_local_status, only the status of the local node may
- * be set.
+ * SPI must be initialized, and you must be in a running transaction.
+ */
+BDRNodeInfo *
+bdr_nodes_get_local_info(uint64 sysid, TimeLineID tli, Oid dboid)
+{
+   int         spi_ret;
+   Oid         argtypes[] = { TEXTOID, OIDOID, OIDOID };
+   Datum       values[3];
+   Datum       status_datum;
+   bool        isnull;
+   BDRNodeInfo *node;
+   char        sysid_str[33];
+   Oid         schema_oid;
+   MemoryContext caller_ctx;
+
+   Assert(IsTransactionState());
+
+   /*
+    * Scratch allocations (query arguments) go into the transaction context.
+    * Remember the caller's context: the result is allocated there, and it
+    * must be the current context again on every non-error return path.
+    */
+   caller_ctx = MemoryContextSwitchTo(CurTransactionContext);
+
+   Assert(MemoryContextIsValid(caller_ctx));
+
+   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, sysid);
+   sysid_str[sizeof(sysid_str)-1] = '\0';
+
+   /*
+    * Determine if BDR is present on this DB. The output plugin can
+    * be started on a db that doesn't actually have BDR active, but
+    * we don't want to allow that.
+    *
+    * Check for a bdr schema.
+    */
+   schema_oid = GetSysCacheOid1(NAMESPACENAME, CStringGetDatum("bdr"));
+   if (schema_oid == InvalidOid)
+       ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+               errmsg("No bdr schema is present in database %s, cannot create a bdr slot",
+                      get_database_name(MyDatabaseId)),
+               errhint("There is no bdr.connections entry for this database on the target node or bdr is not in shared_preload_libraries")));
+
+   values[0] = CStringGetTextDatum(sysid_str);
+   values[1] = ObjectIdGetDatum(tli);
+   values[2] = ObjectIdGetDatum(dboid);
+
+   spi_ret = SPI_execute_with_args(
+           "SELECT node_status, node_local_dsn, node_init_from_dsn"
+           "  FROM bdr.bdr_nodes"
+           " WHERE node_sysid = $1 AND node_timeline = $2 AND node_dboid = $3",
+           3, argtypes, values, NULL, false, 1);
+
+   if (spi_ret != SPI_OK_SELECT)
+       elog(ERROR, "Unable to query bdr.bdr_nodes, SPI error %d", spi_ret);
+
+   if (SPI_processed == 0)
+   {
+       /* Don't leave the caller running in CurTransactionContext */
+       MemoryContextSwitchTo(caller_ctx);
+       return NULL;
+   }
+
+   /* Validate the status column before building the result */
+   status_datum = SPI_getbinval(SPI_tuptable->vals[0],
+                                SPI_tuptable->tupdesc, 1,
+                                &isnull);
+   if (isnull)
+       elog(ERROR, "bdr.bdr_nodes.status NULL; shouldn't happen");
+
+   /* Switch to calling memory context to copy results */
+   MemoryContextSwitchTo(caller_ctx);
+
+   node = palloc(sizeof(BDRNodeInfo));
+   node->sysid = sysid;
+   node->timeline = tli;
+   node->dboid = dboid;
+   node->status = DatumGetChar(status_datum);
+   /* SPI_getvalue returns NULL for a NULL column, so either dsn may be NULL */
+   node->local_dsn = SPI_getvalue(SPI_tuptable->vals[0],
+                                  SPI_tuptable->tupdesc, 2);
+   node->init_from_dsn = SPI_getvalue(SPI_tuptable->vals[0],
+                                      SPI_tuptable->tupdesc, 3);
+
+   return node;
+}
+
+/*
+ * Release a BDRNodeInfo together with the dsn strings it owns.
+ *
+ * A NULL node is accepted and ignored; NULL string members are skipped.
+ */
+void
+bdr_bdr_node_free(BDRNodeInfo *node)
+{
+   if (node != NULL)
+   {
+       if (node->local_dsn != NULL)
+           pfree(node->local_dsn);
+
+       if (node->init_from_dsn != NULL)
+           pfree(node->init_from_dsn);
+
+       pfree(node);
+   }
+}
+
+/*
+ * Update the status field on the local node (as identified by current
+ * sysid,tlid,dboid) of bdr.bdr_nodes. The node record must already exist.
  *
- * SPI must be initialized, and you must be in a running transaction that is
- * not bound to any remote node replication state.
+ * Unlike bdr_nodes_get_local_status, this interface does not accept
+ * sysid, tlid and dboid input but can only set the status of the local node.
  */
 void
 bdr_nodes_set_local_status(char status)
@@ -137,12 +233,21 @@ bdr_nodes_set_local_status(char status)
    Oid         argtypes[] = { CHAROID, TEXTOID, OIDOID, OIDOID };
    Datum       values[4];
    char        sysid_str[33];
+   bool        tx_started = false;
+   bool        spi_pushed;
 
-   Assert(status != '\0'); /* Cannot pass \0 to delete */
-   Assert(IsTransactionState());
+   Assert(status != '\0'); /* Cannot pass \0 */
    /* Cannot have replication apply state set in this tx */
    Assert(replication_origin_id == InvalidRepNodeId);
 
+   if (!IsTransactionState())
+   {
+       tx_started = true;
+       StartTransactionCommand();
+   }
+   spi_pushed = SPI_push_conditional();
+   SPI_connect();
+
    snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
             GetSystemIdentifier());
    sysid_str[sizeof(sysid_str)-1] = '\0';
@@ -153,17 +258,24 @@ bdr_nodes_set_local_status(char status)
    values[3] = ObjectIdGetDatum(MyDatabaseId);
 
    spi_ret = SPI_execute_with_args(
-                              "INSERT INTO bdr.bdr_nodes"
-                              " (node_status, node_sysid, node_timeline, node_dboid)"
-                              " VALUES ($1, $2, $3, $4);",
+                              "UPDATE bdr.bdr_nodes"
+                              "   SET node_status = $1"
+                              " WHERE node_sysid = $2"
+                              "   AND node_timeline = $3"
+                              "   AND node_dboid = $4;",
                               4, argtypes, values, NULL, false, 0);
 
-   if (spi_ret != SPI_OK_INSERT)
-       elog(ERROR, "Unable to insert row (status=%c, node_sysid="
+   if (spi_ret != SPI_OK_UPDATE)
+       elog(ERROR, "Unable to set status=%c of row (node_sysid="
                    UINT64_FORMAT ", node_timeline=%u, node_dboid=%u) "
-                   "into bdr.bdr_nodes: SPI error %d",
+                   "in bdr.bdr_nodes: SPI error %d",
                    status, GetSystemIdentifier(), ThisTimeLineID,
                    MyDatabaseId, spi_ret);
+
+   SPI_finish();
+   SPI_pop_conditional(spi_pushed);
+   if (tx_started)
+       CommitTransactionCommand();
 }
 
 /*
@@ -218,6 +330,301 @@ bdr_fetch_node_id_via_sysid(uint64 sysid, TimeLineID tli, Oid dboid)
    return GetReplicationIdentifier(ident, false);
 }
 
+/*
+ * Read connection configuration data from the DB and return zero or more
+ * palloc'd BdrConnectionConfig results in a list.
+ *
+ * A transaction must be open.
+ *
+ * The list and values are allocated in the calling memory context. By default
+ * this is the transaction memory context, but you can switch to another
+ * context before calling. The calling context is current again on return.
+ *
+ * Each BdrConnectionConfig's char* fields are palloc'd values; dsn and
+ * replication_sets are NULL when the corresponding column is NULL.
+ *
+ * Uses the SPI, so push/pop caller's SPI state if needed.
+ *
+ * May raise exceptions from queries, SPI errors, etc.
+ *
+ * For each target node, both a row whose conn_origin_* columns name this
+ * node and a generic row (origin sysid '0', timeline 0, dboid 0) may exist.
+ * Only the row specific to this node is returned in that case, as it takes
+ * precedence over the generic configuration entry.
+ */
+List*
+bdr_read_connection_configs(void)
+{
+   HeapTuple tuple;
+   StringInfoData query;
+   int         i;
+   int         ret;
+   List       *configs = NIL;
+   MemoryContext caller_ctx, saved_ctx;
+   char        sysid_str[33];
+   Datum       values[3];
+   Oid         types[3] = { TEXTOID, OIDOID, OIDOID };
+
+   Assert(IsTransactionState());
+
+   /* Save the calling memory context, which we'll allocate results in */
+   caller_ctx = MemoryContextSwitchTo(CurTransactionContext);
+
+   initStringInfo(&query);
+
+   /*
+    * Find a connections row specific to this origin node or if none
+    * exists, the default connection data for that node.
+    *
+    * Configurations for all nodes, including the local node, are read.
+    *
+    * DISTINCT ON keeps the first row of each (sysid, timeline, dboid) group,
+    * so the tie-breaker must sort the node-specific row (non-zero
+    * conn_origin_dboid) ahead of the generic one. Sorting conn_origin_sysid
+    * ascending would do the opposite, because the generic marker '0' sorts
+    * before any real system identifier.
+    */
+   appendStringInfoString(&query, "SELECT DISTINCT ON (conn_sysid, conn_timeline, conn_dboid) "
+                            "  conn_sysid, conn_timeline, conn_dboid, "
+                            "  conn_dsn, conn_apply_delay, "
+                            "  conn_replication_sets, "
+                            "  conn_is_unidirectional, "
+                            "  conn_origin_dboid <> 0 AS origin_is_my_id "
+                            "FROM bdr.bdr_connections "
+                            "WHERE (conn_origin_sysid = '0' "
+                            "  AND  conn_origin_timeline = 0 "
+                            "  AND  conn_origin_dboid = 0) "
+                            "   OR (conn_origin_sysid = $1 "
+                            "  AND  conn_origin_timeline = $2 "
+                            "  AND  conn_origin_dboid = $3) "
+                            "ORDER BY conn_sysid, conn_timeline, conn_dboid, "
+                            "         (conn_origin_dboid <> 0) DESC "
+                    );
+
+   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, GetSystemIdentifier());
+   sysid_str[sizeof(sysid_str)-1] = '\0';
+
+   values[0] = CStringGetTextDatum(&sysid_str[0]);
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+
+   SPI_connect();
+
+   ret = SPI_execute_with_args(query.data, 3, types, values, NULL, false, 0);
+
+   if (ret != SPI_OK_SELECT)
+       elog(ERROR, "SPI error while querying bdr.bdr_connections");
+
+   /* Switch to calling memory context to copy results */
+   saved_ctx = MemoryContextSwitchTo(caller_ctx);
+
+   for (i = 0; i < SPI_processed; i++)
+   {
+       Datum           tmp_datum;
+       bool            isnull;
+       char           *tmp_sysid;
+
+       BdrConnectionConfig *cfg = palloc(sizeof(BdrConnectionConfig));
+
+       tuple = SPI_tuptable->vals[i];
+
+       /*
+        * Fetch tuple attributes
+        *
+        * Note: SPI_getvalue calls the output function for the type, so the
+        * string is allocated in our memory context and doesn't need copying.
+        */
+       tmp_sysid = SPI_getvalue(tuple, SPI_tuptable->tupdesc,
+                                getattno("conn_sysid"));
+
+       /* SPI_getvalue returns NULL for a NULL column; don't hand that to sscanf */
+       if (tmp_sysid == NULL)
+           elog(ERROR, "bdr.bdr_connections.conn_sysid is NULL; shouldn't happen");
+
+       if (sscanf(tmp_sysid, UINT64_FORMAT, &cfg->sysid) != 1)
+           elog(ERROR, "Parsing sysid uint64 from %s failed", tmp_sysid);
+
+       /* Only needed for parsing; don't leave it behind in the caller's context */
+       pfree(tmp_sysid);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_timeline"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->timeline = DatumGetObjectId(tmp_datum);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_dboid"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->dboid = DatumGetObjectId(tmp_datum);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_is_unidirectional"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->is_unidirectional = DatumGetBool(tmp_datum);
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("origin_is_my_id"),
+                                 &isnull);
+       Assert(!isnull);
+       cfg->origin_is_my_id = DatumGetBool(tmp_datum);
+
+       cfg->dsn = SPI_getvalue(tuple,
+                               SPI_tuptable->tupdesc,
+                               getattno("conn_dsn"));
+
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_apply_delay"), &isnull);
+       if (isnull)
+           cfg->apply_delay = -1;
+       else
+           cfg->apply_delay = DatumGetInt32(tmp_datum);
+
+       /*
+        * Replication sets are stored in the catalogs as a text[]
+        * of identifiers, so we'll want to unpack that.
+        */
+       tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                 getattno("conn_replication_sets"), &isnull);
+
+       if (isnull)
+           cfg->replication_sets = NULL;
+       else
+           cfg->replication_sets =
+               bdr_textarr_to_identliststr(DatumGetArrayTypeP(tmp_datum));
+
+       configs = lcons(cfg, configs);
+   }
+
+   /* SPI_finish expects to run in the context SPI_connect left us in */
+   MemoryContextSwitchTo(saved_ctx);
+
+   SPI_finish();
+
+   MemoryContextSwitchTo(caller_ctx);
+
+   return configs;
+}
+
+/*
+ * Release the palloc'd string members (dsn, replication_sets) of a
+ * BdrConnectionConfig.
+ *
+ * Only the members are freed; the struct itself stays allocated and remains
+ * the caller's to dispose of. Freed members are reset to NULL so the struct
+ * holds no dangling pointers and a repeated call is harmless. A NULL cfg is
+ * ignored.
+ */
+void
+bdr_free_connection_config(BdrConnectionConfig *cfg)
+{
+   if (cfg == NULL)
+       return;
+
+   if (cfg->dsn != NULL)
+   {
+       pfree(cfg->dsn);
+       cfg->dsn = NULL;
+   }
+
+   if (cfg->replication_sets != NULL)
+   {
+       pfree(cfg->replication_sets);
+       cfg->replication_sets = NULL;
+   }
+}
+
+/*
+ * Fetch the connection configuration for the node identified by
+ * (sysid, timeline, dboid), i.e. the bdr.bdr_connections entry with that
+ * (conn_sysid, conn_timeline, conn_dboid).
+ *
+ * The result is allocated in TopMemoryContext, so it outlives the
+ * transaction that is started (and committed) here when the caller has none
+ * open. Returns NULL if no entry exists and missing_ok is true; raises an
+ * ERROR if no entry exists and missing_ok is false.
+ */
+BdrConnectionConfig*
+bdr_get_connection_config(uint64 sysid, TimeLineID timeline, Oid dboid,
+                         bool missing_ok)
+{
+   List *configs;
+   ListCell *lc;
+   MemoryContext saved_ctx;
+   BdrConnectionConfig *found_config = NULL;
+   bool tx_started = false;
+
+   Assert(MyDatabaseId != InvalidOid);
+
+   if (!IsTransactionState())
+   {
+       tx_started = true;
+       StartTransactionCommand();
+   }
+
+   saved_ctx = MemoryContextSwitchTo(TopMemoryContext);
+   configs = bdr_read_connection_configs();
+   MemoryContextSwitchTo(saved_ctx);
+
+   /*
+    * TODO DYNCONF Instead of reading all configs and then discarding all but
+    * the interesting one, we should really be doing a different query that
+    * returns only the configuration of interest. As this runs only during apply
+    * worker startup the impact is negligible.
+    *
+    * Every entry lives in TopMemoryContext, so walk the whole list and
+    * release each one we are not returning, including the struct itself.
+    * Stopping at the first match would leak all entries after it.
+    */
+   foreach(lc, configs)
+   {
+       BdrConnectionConfig *cfg = (BdrConnectionConfig*) lfirst(lc);
+
+       if (found_config == NULL
+           && cfg->sysid == sysid
+           && cfg->timeline == timeline
+           && cfg->dboid == dboid)
+       {
+           found_config = cfg;
+       }
+       else
+       {
+           bdr_free_connection_config(cfg);
+           pfree(cfg);
+       }
+   }
+
+   /* The cells are no longer needed; the kept entry is referenced directly */
+   list_free(configs);
+
+   if (found_config == NULL && !missing_ok)
+       elog(ERROR, "Failed to find expected bdr.connections row "
+                   "(conn_sysid,conn_timeline,conn_dboid) = "
+                   "("UINT64_FORMAT",%u,%u) "
+                   "in bdr.bdr_connections",
+                   sysid, timeline, dboid);
+
+   if (tx_started)
+       CommitTransactionCommand();
+
+   return found_config;
+}
+
+
+/*
+ * Look up a column of the current SPI result (SPI_tuptable) by name and
+ * return its attribute number; raises an ERROR if there is no such column.
+ */
+static int
+getattno(const char *colname)
+{
+   int colno = SPI_fnumber(SPI_tuptable->tupdesc, colname);
+
+   if (colno != SPI_ERROR_NOATTRIBUTE)
+       return colno;
+
+   elog(ERROR, "SPI error while reading %s from bdr.bdr_connections", colname);
+
+   /* not reached: elog(ERROR) does not return */
+   return colno;
+}
+
+/*
+ * Given a text[] array guaranteed to contain no nulls, return an
+ * identifier-quoted comma-separated string allocated in the current memory
+ * context. An empty array yields an empty string.
+ *
+ * All intermediate allocations are released before returning, because
+ * callers may run this in a long-lived memory context.
+ */
+static char*
+bdr_textarr_to_identliststr(ArrayType *textarray)
+{
+   Datum          *elems;
+   int             nelems, i;
+   StringInfoData  si;
+
+   /* Passing no nulls array makes deconstruct_array reject NULL elements */
+   deconstruct_array(textarray,
+                     TEXTOID, -1, false, 'i',
+                     &elems, NULL, &nelems);
+
+   if (nelems == 0)
+   {
+       pfree(elems);
+       return pstrdup("");
+   }
+
+   initStringInfo(&si);
+
+   for (i = 0; i < nelems; i++)
+   {
+       char       *raw = TextDatumGetCString(elems[i]);
+       const char *quoted = quote_identifier(raw);
+
+       if (i > 0)
+           appendStringInfoString(&si, ",");
+       appendStringInfoString(&si, quoted);
+
+       /*
+        * quote_identifier returns its argument unchanged when no quoting is
+        * needed, and a fresh palloc'd string otherwise.
+        */
+       if (quoted != raw)
+           pfree((char *) quoted);
+       pfree(raw);
+   }
+
+   pfree(elems);
+
+   /*
+    * The stringinfo is on the stack, but its data element is palloc'd
+    * in the caller's context and can be returned safely.
+    */
+   return si.data;
+}
+
 /*
  * Helper to format node identity info into buffers, which must already be
  * allocated and big enough to hold a unit64 + terminator (33 bytes).
diff --git a/bdr_common.c b/bdr_common.c
new file mode 100644 (file)
index 0000000..c4ad1cf
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * bdr_common.c
+ *
+ * BiDirectionalReplication
+ *
+ * Utility functions that can be shared between the extension and the CLI
+ * (they don't require server-side libraries).
+ *
+ * Copyright (c) 2015, PostgreSQL Global Development Group
+ *
+ * bdr_common.c
+ */
+
+
+#include "postgres.h"
+
+#include "access/xlogdefs.h"
+#include "nodes/pg_list.h"
+
+#include "bdr_internal.h"
+
+
+/*
+ * Build the replication slot name for the given node identifiers and store
+ * it in *slot_name (at most NAMEDATALEN bytes including the terminator).
+ *
+ * The name is produced from BDR_SLOT_NAME_FORMAT using, in this order:
+ * local_dboid, the system identifier rendered as text, tlid, dboid and
+ * EMPTY_REPLICATION_NAME.
+ */
+void
+bdr_slot_name(Name slot_name, uint64 sysid, TimeLineID tlid,
+             Oid dboid, Oid local_dboid)
+{
+   char       *out = NameStr(*slot_name);
+   char        sysid_text[33];
+
+   /* Render the 64-bit system identifier as text first. */
+   snprintf(sysid_text, sizeof(sysid_text), UINT64_FORMAT, sysid);
+   sysid_text[sizeof(sysid_text) - 1] = '\0';
+
+   snprintf(out, NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
+            local_dboid, sysid_text, tlid, dboid,
+            EMPTY_REPLICATION_NAME);
+
+   /* Force termination in case the formatted name was truncated. */
+   out[NAMEDATALEN - 1] = '\0';
+}
index d979bbe93f208c57e07c5f6b11c5df6e1fff514a..0447a11d9b51b5eda630eedee718823b2ac1ab53 100644 (file)
@@ -45,12 +45,6 @@ static Oid BdrConflictTypeOid = InvalidOid;
 static Oid BdrConflictResolutionOid = InvalidOid;
 static Oid BdrConflictHistorySeqId = InvalidOid;
 
-/*
- * All this code runs only in the context of an apply worker, so
- * we can access the apply worker state global safely
- */
-extern BdrApplyWorker *bdr_apply_worker;
-
 #define BDR_CONFLICT_HISTORY_COLS 30
 #define SYSID_DIGITS 33
 
index dfef1a5f772cd2a6761300738238dde454336269..e700cf27ebd8927b74798f944ec3a580311edf70 100644 (file)
@@ -773,7 +773,7 @@ BdrExecutorStart(QueryDesc *queryDesc, int eflags)
    bool        performs_writes = false;
    ListCell   *l;
 
-   if (bdr_always_allow_writes || !bdr_is_bdr_activated_db(MyDatabaseId))
+   if (bdr_always_allow_writes)
        goto done;
 
    /* identify whether this is a modifying statement */
@@ -786,6 +786,9 @@ BdrExecutorStart(QueryDesc *queryDesc, int eflags)
    if (!performs_writes)
        goto done;
 
+   if (!bdr_is_bdr_activated_db(MyDatabaseId))
+       goto done;
+
 #ifdef BUILDING_BDR
    bdr_locks_check_query();
 #endif
index 134d98010fe53b47ca4de3efa1099187f07fddfb..d0e98ef79bc11c14fa1031eb944f3e076bc33941 100644 (file)
@@ -13,6 +13,8 @@
 
 #include "postgres_fe.h"
 
+#include "getopt_long.h"
+
 #include "port.h"
 
 #include "libpq-fe.h"
 typedef struct RemoteInfo {
    uint64      sysid;
    TimeLineID  tlid;
-   Oid         dboid;
+   int         numdbs;     /* number of entries in dboids[] and dbnames[] */
+   Oid        *dboids;     /* database oids, indexed in step with dbnames[] */
+   char      **dbnames;    /* database names, indexed in step with dboids[] */
 } RemoteInfo;
 
-static char            *argv0 = NULL;
-static const char  *progname;
-static uint64       system_identifier;
-static NameData         restore_point_name;
-static char            *data_dir = NULL;
-static char            *config_options = "";
-static char             pid_file[MAXPGPATH];
-static time_t       start_time;
+/* Identity (system identifier and timeline) of the remote and the local node. */
+typedef struct NodeInfo {
+   uint64      remote_sysid;   /* system identifier read from the remote node */
+   TimeLineID  remote_tlid;    /* timeline read from the remote node */
+   uint64      local_sysid;    /* identifier generated for the new local node */
+   TimeLineID  local_tlid;     /* timeline the local node is expected to end up on */
+} NodeInfo;
+
+/*
+ * Output verbosity of the tool. It starts at VERBOSITY_NORMAL and is raised
+ * by one step for each -v option; print_msg() emits a message only when the
+ * current verbosity is at least the level the message was tagged with.
+ */
+typedef enum {
+   VERBOSITY_NORMAL,
+   VERBOSITY_VERBOSE,
+   VERBOSITY_DEBUG
+} VerbosityLevelEnum;
+
+static char           *argv0 = NULL;
+static const char  *progname;
+static char           *data_dir = NULL;
+static char            pid_file[MAXPGPATH];
+static time_t      start_time;
+static VerbosityLevelEnum  verbosity = VERBOSITY_NORMAL;
 
 /* defined as static so that die() can close them */
 static PGconn      *local_conn = NULL;
 static PGconn      *remote_conn = NULL;
 
-BdrConnectionConfig    **bdr_connection_configs;
-size_t              bdr_connection_config_count;
-
 static void signal_handler(int sig);
 static void usage(void);
 static void die(const char *fmt,...)
 __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
-static void print_msg(const char *fmt,...)
-__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
+static void print_msg(VerbosityLevelEnum level, const char *fmt,...)
+__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));
 
-static int run_pg_ctl(const char *arg, const char *opts);
-static char *get_postgres_guc_value(char *guc, char *defval);
-static bool wait_postmaster_connection(void);
-static void wait_postgres_shutdown(void);
+static int run_pg_ctl(const char *arg);
+static void run_basebackup(const char *remote_connstr, const char *data_dir);
+static void wait_postmaster_connection(const char *connstr);
+static void wait_postmaster_shutdown(void);
 
-#ifdef BUILDING_UDR
-static void initialize_bdr(PGconn *conn);
-#endif
-static void remove_unwanted_state(void);
-static void initialize_replication_identifiers(char *remote_lsn);
-static void create_replication_identifier(PGconn *conn,
-               const char *remote_ident, char *remote_lsn);
-static char *create_restore_point(char *remote_connstr);
-static void initialize_replication_slots(bool init_replica);
-static void create_replication_slot(PGconn *conn, Name slot_name);
-static RemoteInfo *get_remote_info(PGconn *conn, char* aux_connstr);
-static Oid get_dboid_from_dbname(PGconn *conn, const char* dbname);
+static void validate_remote_node(PGconn *conn);
+static void initialize_node_entry(PGconn *conn, NodeInfo *ni, Oid dboid,
+                                 char *remote_connstr);
+static void remove_unwanted_files(void);
+static void remove_unwanted_data(PGconn *conn, char *dbname);
+static void initialize_replication_identifier(PGconn *conn, NodeInfo *ni, Oid dboid, char *remote_lsn);
+static char *create_restore_point(PGconn *conn, char *restore_point_name);
+static void initialize_replication_slot(PGconn *conn, NodeInfo *ni, Oid dboid);
+static void bdr_node_start(PGconn *conn, char *remote_connstr, char *local_connstr);
+
+static RemoteInfo *get_remote_info(char* connstr);
+
+static void initialize_data_dir(char *data_dir, char *connstr,
+                   char *postgresql_conf, char *pg_hba_conf);
 
 static uint64 GenerateSystemIdentifier(void);
-static int set_sysid(void);
+static int set_sysid(uint64 sysid);
 
-static void read_bdr_config(void);
 static void WriteRecoveryConf(PQExpBuffer contents);
+static void CopyConfFile(char *fromfile, char *tofile);
 
-static char *detect_local_conninfo(void);
-static char *detect_remote_conninfo(void);
-char *get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser);
-static char *PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values);
-static char *escapeConninfoValue(const char *val);
+char *get_connstr(char *dbname, char *dbhost, char *dbport, char *dbuser);
+static char *PQconninfoParamsToConnstr(const char *const * keywords, const char *const * values);
+static void appendPQExpBufferConnstrValue(PQExpBuffer buf, const char *str);
 
-static bool parse_bool(const char *value, bool *result);
-static bool parse_bool_with_len(const char *value, size_t len, bool *result);
-static char *trimwhitespace(const char *str);
-static char    **split_list_guc(char *str, size_t *count);
-
-static bool is_pg_dir(char *path);
+static bool file_exists(const char *path);
+static bool is_pg_dir(const char *path);
+static void copy_file(char *fromfile, char *tofile);
 static char *find_other_exec_or_die(const char *argv0, const char *target, const char *versionstr);
 static bool postmaster_is_alive(pid_t pid);
 static long get_pgpid(void);
-static char **readfile(const char *path);
-static void free_readfile(char **optlines);
 
+/*
+ * Open a libpq connection described by connstr.
+ *
+ * If dbname is not NULL, a "dbname=<value>" keyword is appended to connstr
+ * so that the connection targets that database. On connection failure die()
+ * is called with the libpq error message and the connection string used.
+ *
+ * Returns the established connection; the caller must PQfinish() it.
+ */
+static PGconn *
+connectdb(char *connstr, const char *dbname)
+{
+   PGconn *conn;
+   char   *connstring = connstr;
+
+   /* TODO: deparse and reconstruct the connection string properly. */
+   if (dbname)
+   {
+       PQExpBuffer  connbuf = createPQExpBuffer();
+
+       printfPQExpBuffer(connbuf, "%s dbname=", connstr);
+       appendPQExpBufferConnstrValue(connbuf, dbname);
+       connstring = pg_strdup(connbuf->data);
+       destroyPQExpBuffer(connbuf);
+   }
+
+   conn = PQconnectdb(connstring);
+   if (PQstatus(conn) != CONNECTION_OK)
+       die(_("Connection to database failed: %s, connection string was: %s\n"), PQerrorMessage(conn), connstring);
+
+   /*
+    * Release the private copy built above; it is not needed once
+    * PQconnectdb() has returned. The caller's own string is left alone.
+    */
+   if (connstring != connstr)
+       pg_free(connstring);
+
+   return conn;
+}
 
 void signal_handler(int sig)
 {
@@ -129,13 +159,40 @@ main(int argc, char **argv)
    int i;
    int c;
    PQExpBuffer recoveryconfcontents = createPQExpBuffer();
-   char *remote_lsn;
-   bool hot_standby;
+   RemoteInfo *remote_info;
+   NodeInfo    node_info;
+   char        restore_point_name[NAMEDATALEN];
+   char       *remote_lsn;
+   bool        stop = false;
+   int         optindex;
    char *local_connstr = NULL;
+   char *local_dbhost = NULL,
+        *local_dbport = NULL,
+        *local_dbuser = NULL;
    char *remote_connstr = NULL;
-   char *dbhost = NULL,
-        *dbport = NULL,
-        *dbuser = NULL;
+   char *remote_dbhost = NULL,
+        *remote_dbport = NULL,
+        *remote_dbuser = NULL;
+   char *postgresql_conf = NULL,
+        *pg_hba_conf = NULL,
+        *recovery_conf = NULL;
+
+   static struct option long_options[] = {
+       {"pgdata", required_argument, NULL, 'D'},
+       {"remote-dbname", required_argument, NULL, 'd'},
+       {"remote-host", required_argument, NULL, 'h'},
+       {"remote-port", required_argument, NULL, 'p'},
+       {"remote-user", required_argument, NULL, 'U'},
+       {"local-dbname", required_argument, NULL, 2},
+       {"local-host", required_argument, NULL, 3},
+       {"local-port", required_argument, NULL, 4},
+       {"local-user", required_argument, NULL, 5},
+       {"postgresql-conf", required_argument, NULL, 6},
+       {"hba-conf", required_argument, NULL, 7},
+       {"recovery-conf", required_argument, NULL, 8},
+       {"stop", no_argument, NULL, 's'},
+       {NULL, 0, NULL, 0}
+   };
 
    argv0 = argv[0];
    progname = get_progname(argv[0]);
@@ -156,30 +213,66 @@ main(int argc, char **argv)
    }
 
    /* Option parsing and validation */
-   while ((c = getopt(argc, argv, "D:d:h:o:p:U:")) != -1)
+   /* -s/--stop and -v are plain flags, so they take no ':' in the short option string. */
+   while ((c = getopt_long(argc, argv, "D:d:h:p:sU:v", long_options, &optindex)) != -1)
    {
        switch (c)
        {
            case 'D':
                data_dir = pg_strdup(optarg);
                break;
-           case 'o':
-               config_options = pg_strdup(optarg);
-               break;
            case 'd':
                remote_connstr = pg_strdup(optarg);
                break;
            case 'h':
-               dbhost = pg_strdup(optarg);
+               remote_dbhost = pg_strdup(optarg);
                break;
            case 'p':
-               dbport = pg_strdup(optarg);
+               remote_dbport = pg_strdup(optarg);
                break;
            case 'U':
-               dbuser = pg_strdup(optarg);
+               remote_dbuser = pg_strdup(optarg);
+               break;
+           case 'v':
+               verbosity++;
+               break;
+           case 2:
+               local_connstr = pg_strdup(optarg);
+               break;
+           case 3:
+               local_dbhost = pg_strdup(optarg);
+               break;
+           case 4:
+               local_dbport = pg_strdup(optarg);
+               break;
+           case 5:
+               local_dbuser = pg_strdup(optarg);
+               break;
+           case 6:
+               {
+                   postgresql_conf = pg_strdup(optarg);
+                   if (postgresql_conf != NULL && !file_exists(postgresql_conf))
+                       die(_("The specified postgresql.conf file does not exist."));
+                   break;
+               }
+           case 7:
+               {
+                   pg_hba_conf = pg_strdup(optarg);
+                   if (pg_hba_conf != NULL && !file_exists(pg_hba_conf))
+                       die(_("The specified pg_hba.conf file does not exist."));
+                   break;
+               }
+           case 8:
+               {
+                   recovery_conf = pg_strdup(optarg);
+                   if (recovery_conf != NULL && !file_exists(recovery_conf))
+                       die(_("The specified recovery.conf file does not exist."));
+                   break;
+               }
+           case 's':
+               stop = true;
                break;
            default:
-               fprintf(stderr, _("%s: unknown option\n"), progname);
+               fprintf(stderr, _("Unknown option\n"));
                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                exit(1);
        }
@@ -187,110 +280,199 @@ main(int argc, char **argv)
 
    if (data_dir == NULL)
    {
-       fprintf(stderr, _("%s: no data directory specified\n"), progname);
+       fprintf(stderr, _("No data directory specified\n"));
        fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
        exit(1);
    }
-   if (!is_pg_dir(data_dir))
-   {
-       die(_("%s: \"%s\" is not valid postgres data directory\n"), progname, data_dir);
-   }
-   snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", data_dir);
 
-   print_msg(_("%s: starting...\n"), progname);
+   remote_connstr = get_connstr(remote_connstr, remote_dbhost, remote_dbport, remote_dbuser);
+   local_connstr = get_connstr(local_connstr, local_dbhost, local_dbport, local_dbuser);
 
+   if (!remote_connstr || !strlen(remote_connstr))
+       die(_("Remote connection must be specified.\n"));
+   if (!local_connstr || !strlen(local_connstr))
+       die(_("Local connection must be specified.\n"));
+
+   print_msg(VERBOSITY_NORMAL, _("%s: starting ...\n"), progname);
+
+   /*
+    * Generate new identifier for local node.
+    */
+   node_info.local_sysid = GenerateSystemIdentifier();
+   print_msg(VERBOSITY_VERBOSE,
+             _("Generated new local system identifier: "UINT64_FORMAT"\n"),
+             node_info.local_sysid);
+
+   /* Read the remote server identification. */
+   print_msg(VERBOSITY_NORMAL,
+             _("Getting remote server identification ...\n"));
+   remote_info = get_remote_info(remote_connstr);
+
+   /* If there are no BDR enabled dbs, just bail. */
+   if (remote_info->numdbs < 1)
+       die(_("Remote node does not have any BDR enabled databases.\n"));
+
+   print_msg(VERBOSITY_NORMAL,
+             _("Detected %d BDR database(s) on remote server\n"),
+             remote_info->numdbs);
+
+   node_info.remote_sysid = remote_info->sysid;
+   node_info.remote_tlid = remote_info->tlid;
    /*
-    * Initialization
+    * Once the physical replication reaches the restore point, it will
+    * bump the timeline by one.
     */
-   system_identifier = GenerateSystemIdentifier();
-   print_msg(_("Assigning new system identifier: "UINT64_FORMAT"...\n"), system_identifier);
+   node_info.local_tlid = remote_info->tlid + 1;
 
-   read_bdr_config();
+   print_msg(VERBOSITY_NORMAL,
+             _("Updating BDR configuration on the remote node:\n"));
 
-   if (!remote_connstr && !dbhost && !dbport && !dbuser)
-       remote_connstr = detect_remote_conninfo();
-   else
-       remote_connstr = get_conninfo(remote_connstr, dbhost, dbport, dbuser);
+   /* Initialize remote node. */
+   for (i = 0; i < remote_info->numdbs; i++)
+   {
+       char *dbname = remote_info->dbnames[i];
+       remote_conn = connectdb(remote_connstr, dbname);
 
-   if (!remote_connstr || !strlen(remote_connstr))
-       die(_("Could not detect remote connection\n"));
+       /*
+        * Make sure that we can use the remote node as init node.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: validating BDR configuration ...\n"), dbname);
+       validate_remote_node(remote_conn);
+
+       /*
+        * Create replication slots on remote node.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: creating replication slot ...\n"), dbname);
+       initialize_replication_slot(remote_conn, &node_info, remote_info->dboids[i]);
 
-   local_connstr = detect_local_conninfo();
-   if (local_connstr == NULL)
-       die(_("Failed to detect local connection info. Please specify replica_local_dsn in the postgresql.conf.\n"));
+       /*
+        * Create node entry for future local node.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: creating node entry for local node ...\n"), dbname);
+       initialize_node_entry(remote_conn, &node_info, remote_info->dboids[i],
+                             remote_connstr);
 
-   /* Hot standby would start cluster in read only mode, we don't want that. */
-   if (!parse_bool(get_postgres_guc_value("hot_standby", NULL), &hot_standby))
-       die(_("Invalid boolean value for configuration parameter \"hot_standby\"\n"));
-   if (hot_standby)
-       die(_("Cluster cannot be configured with hot_standby = on when using bdr\n"));
+       /*
+        * Don't hold connection since the next step might take long time.
+        * Clear the global as well so that die() won't PQfinish() it again.
+        */
+       PQfinish(remote_conn);
+       remote_conn = NULL;
+   }
 
-   remove_unwanted_state();
+   /*
+    * Create basebackup or use existing one
+    */
+   initialize_data_dir(data_dir, remote_connstr, postgresql_conf, pg_hba_conf);
+   snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", data_dir);
 
    /*
-    * Initialization done, create replication slots to init node
-    * and restore point on remote side.
+    * Create restore point to which we will catchup via physical replication.
     */
-   print_msg(_("Creating primary replication slots...\n"));
-   initialize_replication_slots(true);
+   remote_conn = PQconnectdb(remote_connstr);
+   if (PQstatus(remote_conn) != CONNECTION_OK)
+       die(_("Connection to remote node failed: %s"), PQerrorMessage(remote_conn));
+
+   print_msg(VERBOSITY_NORMAL, _("Creating restore point on remote node ...\n"));
+
+   snprintf(restore_point_name, NAMEDATALEN,
+            "bdr_"UINT64_FORMAT, node_info.local_sysid);
+   remote_lsn = create_restore_point(remote_conn, restore_point_name);
 
-   print_msg(_("Creating restore point...\n"));
-   snprintf(NameStr(restore_point_name), NAMEDATALEN,
-            "bdr_"UINT64_FORMAT, system_identifier);
-   remote_lsn = create_restore_point(remote_connstr);
+   PQfinish(remote_conn);
+   remote_conn = NULL;     /* die() must not PQfinish() it a second time */
 
    /*
     * Get local db to consistent state (for lsn after slot creation).
     */
-   print_msg(_("Bringing cluster to the restore point...\n"));
-   appendPQExpBuffer(recoveryconfcontents, "standby_mode = 'on'\n");
-   appendPQExpBuffer(recoveryconfcontents, "recovery_target_name = '%s'\n", NameStr(restore_point_name));
+   print_msg(VERBOSITY_NORMAL,
+             _("Bringing local node to the restore point ...\n"));
+   if (recovery_conf)
+   {
+       CopyConfFile(recovery_conf, "recovery.conf");
+   }
+   else
+   {
+       appendPQExpBuffer(recoveryconfcontents, "standby_mode = 'on'\n");
+       appendPQExpBuffer(recoveryconfcontents, "primary_conninfo = '%s'\n", remote_connstr);
+   }
+   appendPQExpBuffer(recoveryconfcontents, "recovery_target_name = '%s'\n", restore_point_name);
    appendPQExpBuffer(recoveryconfcontents, "recovery_target_inclusive = true\n");
-   appendPQExpBuffer(recoveryconfcontents, "primary_conninfo = '%s'\n", remote_connstr);
    WriteRecoveryConf(recoveryconfcontents);
 
-   run_pg_ctl("start -w -l \"bdr_init_copy_postgres.log\"",
-#ifdef BUILDING_BDR
-              "-c shared_preload_libraries=''"
-#else
-              ""
-#endif
-              );
-   if (!wait_postmaster_connection())
-       die(_("Could not connect to local node"));
+   /*
+    * Start local node with BDR disabled, and wait until it starts accepting
+    * connections which means it has caught up to the restore point.
+    */
+   run_pg_ctl("start -l \"bdr_init_copy_postgres.log\" -o \"-c shared_preload_libraries=''\"");
+   wait_postmaster_connection(local_connstr);
 
    /*
-    * Postgres should have reached restore point and is accepting connections,
-    * create slots to other nodes and local replication identifiers.
+    * Clean any per-node data that were copied by pg_basebackup.
     */
-   local_conn = PQconnectdb(local_connstr);
-   if (PQstatus(local_conn) != CONNECTION_OK)
-       die(_("Connection to database failed: %s"), PQerrorMessage(local_conn));
-
-#ifdef BUILDING_UDR
-   print_msg(_("Ensuring bdr extension is installed...\n"));
-   initialize_bdr(remote_conn);
-   initialize_bdr(local_conn);
-#endif
+   for (i = 0; i < remote_info->numdbs; i++)
+   {
+       local_conn = connectdb(local_connstr, remote_info->dbnames[i]);
+
+       remove_unwanted_data(local_conn, remote_info->dbnames[i]);
+
+       PQfinish(local_conn);
+       local_conn = NULL;
+   }
 
-   print_msg(_("Creating secondary replication slots...\n"));
-   initialize_replication_slots(false);
-   print_msg(_("Creating local replication identifier...\n"));
-   initialize_replication_identifiers(remote_lsn);
+   /* Stop Postgres so we can reset system id and start it with BDR loaded. */
+   run_pg_ctl("stop");
+   wait_postmaster_shutdown();
 
-   PQfinish(local_conn);
-   local_conn = NULL;
+   /*
+    * Individualize the local node by changing the system identifier.
+    */
+   set_sysid(node_info.local_sysid);
 
    /*
-    * Make this node functional as individual bdr node and start it.
+    * Start the node again, now with BDR active so that we can join the node
+    * to the BDR cluster. This is the final start, so don't log to the special
+    * log file anymore.
     */
-   run_pg_ctl("stop", "");
-   wait_postgres_shutdown();
+   print_msg(VERBOSITY_NORMAL,
+             _("Initializing BDR on the local node:\n"));
+
+   run_pg_ctl("start -l \"bdr_init_copy_postgres.log\"");
+   wait_postmaster_connection(local_connstr);
+
+   for (i = 0; i < remote_info->numdbs; i++)
+   {
+       char *dbname = remote_info->dbnames[i];
+
+       local_conn = connectdb(local_connstr, dbname);
+
+       /*
+        * Create the identifier which is setup with the position to which we already
+        * caught up using physical replication.
+        */
+       print_msg(VERBOSITY_VERBOSE,
+                 _(" %s: creating replication identifier ...\n"), dbname);
+       initialize_replication_identifier(local_conn, &node_info, remote_info->dboids[i], remote_lsn);
+
+       /*
+        * And finally add the node to the cluster.
+        */
+       print_msg(VERBOSITY_NORMAL,
+                 _(" %s: adding the database to BDR cluster ...\n"), dbname);
+       bdr_node_start(local_conn, remote_connstr, local_connstr);
 
-   set_sysid();
+       PQfinish(local_conn);
+       local_conn = NULL;
+   }
+
+   /* If user does not want the node to be running at the end, stop it. */
+   if (stop)
+   {
+       print_msg(VERBOSITY_NORMAL, _("Stopping the local node ...\n"));
+       run_pg_ctl("stop");
+       wait_postmaster_shutdown();
+   }
 
-   print_msg(_("Starting the cluster...\n"));
-   run_pg_ctl("start -w", "-c bdr.init_from_basedump=true");
+   print_msg(VERBOSITY_NORMAL, _("All done\n"));
 
    return 0;
 }
@@ -302,19 +484,29 @@ main(int argc, char **argv)
 static void
 usage(void)
 {
-   printf(_("%s initializes bdr from PostgreSQL instance made using pg_basebackup.\n\n"), progname);
-   printf(_("pg_basebackup -X stream must be used to populate the data directory before\n"));
-   printf(_("running %s to initialize BDR on it.\n\n"), progname);
+   printf(_("%s initializes new BDR node from existing BDR instance.\n\n"), progname);
    printf(_("Usage:\n"));
    printf(_("  %s [OPTION]...\n"), progname);
    printf(_("\nGeneral options:\n"));
-   printf(_("  -D, --pgdata=DIRECTORY base backup directory\n"));
-   printf(_("  -o                     configuration options passed to pg_ctl's -o\n"));
+   printf(_("  -D, --pgdata=DIRECTORY data directory to be used for new node,\n"));
+   printf(_("                         can be either empty/non-existing directory,\n"));
+   printf(_("                         or directory populated using pg_basebackup -X stream\n"));
+   printf(_("                         command\n"));
+   printf(_("  -s, --stop             stop the server once the initialization is done\n"));
+   /* -v is accepted by the option parser, so document it as well. */
+   printf(_("  -v                     increase verbosity, may be given more than once\n"));
+   printf(_("  --postgresql-conf      path to the new postgresql.conf\n"));
+   printf(_("  --hba-conf             path to the new pg_hba.conf\n"));
+   printf(_("  --recovery-conf        path to the template recovery.conf\n"));
    printf(_("\nConnection options:\n"));
-   printf(_("  -d, --dbname=CONNSTR   connection string\n"));
-   printf(_("  -h, --host=HOSTNAME    database server host or socket directory\n"));
-   printf(_("  -p, --port=PORT        database server port number\n"));
-   printf(_("  -U, --username=NAME    connect as specified database user\n"));
+   printf(_("  -d, --remote-dbname=CONNSTR\n"));
+   printf(_("                         connection string for remote node\n"));
+   printf(_("  -h, --remote-host=HOSTNAME\n"));
+   printf(_("                         server host or socket directory for remote node\n"));
+   printf(_("  -p, --remote-port=PORT server port number for remote node\n"));
+   printf(_("  -U, --remote-user=NAME connect as specified database user to the remote node\n"));
+   printf(_("  --local-dbname=CONNSTR connection string for local node\n"));
+   printf(_("  --local-host=HOSTNAME  server host or socket directory for local node\n"));
+   printf(_("  --local-port=PORT      server port number for local node\n"));
+   printf(_("  --local-user=NAME      connect as specified database user to the local node\n"));
 }
 
 /*
@@ -328,11 +520,13 @@ die(const char *fmt,...)
    vfprintf(stderr, fmt, argptr);
    va_end(argptr);
 
-   PQfinish(local_conn);
-   PQfinish(remote_conn);
+   if (local_conn)
+       PQfinish(local_conn);
+   if (remote_conn)
+       PQfinish(remote_conn);
 
    if (get_pgpid())
-       run_pg_ctl("stop -s", "");
+       run_pg_ctl("stop -s");
 
    exit(1);
 }
@@ -341,13 +535,16 @@ die(const char *fmt,...)
  * Print message to stdout and flush
  */
 static void
-print_msg(const char *fmt,...)
+print_msg(VerbosityLevelEnum level, const char *fmt,...)
 {
-   va_list argptr;
-   va_start(argptr, fmt);
-   vfprintf(stdout, fmt, argptr);
-   va_end(argptr);
-   fflush(stdout);
+   /* Drop messages whose level exceeds the verbosity selected with -v. */
+   if (verbosity >= level)
+   {
+       va_list argptr;
+       va_start(argptr, fmt);
+       vfprintf(stdout, fmt, argptr);
+       va_end(argptr);
+       fflush(stdout);
+   }
 }
 
 
@@ -355,15 +552,19 @@ print_msg(const char *fmt,...)
  * Start pg_ctl with given argument(s) - used to start/stop postgres
  */
 static int
-run_pg_ctl(const char *arg, const char *opts)
+run_pg_ctl(const char *arg)
 {
    int          ret;
    PQExpBuffer  cmd = createPQExpBuffer();
    char        *exec_path = find_other_exec_or_die(argv0, "pg_ctl", "pg_ctl (PostgreSQL) " PG_VERSION "\n");
 
-   appendPQExpBuffer(cmd, "%s %s -D \"%s\" -o \"%s %s\"", exec_path, arg, data_dir,
-                     opts, config_options);
+   appendPQExpBuffer(cmd, "%s %s -D \"%s\"", exec_path, arg, data_dir);
+
+   /* Run pg_ctl in silent mode unless we run in debug mode. */
+   if (verbosity < VERBOSITY_DEBUG)
+       appendPQExpBuffer(cmd, " -s");
 
+   print_msg(VERBOSITY_DEBUG, _("Running pg_ctl: %s.\n"), cmd->data);
    ret = system(cmd->data);
 
    destroyPQExpBuffer(cmd);
@@ -373,53 +574,43 @@ run_pg_ctl(const char *arg, const char *opts)
 
 
 /*
- * Ugly way to read postgresql.conf
+ * Run pg_basebackup to create the copy of the origin node.
  */
-static char *
-get_postgres_guc_value(char *guc, char *defval)
+static void
+run_basebackup(const char *remote_connstr, const char *data_dir)
 {
-   FILE        *fp;
-   int          status;
+   int          ret;
    PQExpBuffer  cmd = createPQExpBuffer();
-   char        *exec_path = find_other_exec_or_die(argv0, "postgres", PG_BACKEND_VERSIONSTR);
-   PQExpBuffer  retbuf = createPQExpBuffer();
-   char         buf[8192];
-   char        *ret;
-
-   printfPQExpBuffer(cmd, "%s -D \"%s\" %s -C \"%s\" 2>\"%s\"",
-                     exec_path, data_dir, config_options, guc, DEVNULL);
+   char        *exec_path = find_other_exec_or_die(argv0, "pg_basebackup", "pg_basebackup (PostgreSQL) " PG_VERSION "\n");
 
-   fp = popen(cmd->data, "r");
-   while (fgets(buf, sizeof(buf), fp) != NULL)
-       appendPQExpBufferStr(retbuf, buf);
+   appendPQExpBuffer(cmd, "%s -D \"%s\" -d \"%s\" -X s -P", exec_path, data_dir, remote_connstr);
 
-   status = pclose(fp);
-   destroyPQExpBuffer(cmd);
+   /* Run pg_basebackup in verbose mode if we are running in verbose mode. */
+   if (verbosity >= VERBOSITY_VERBOSE)
+       appendPQExpBuffer(cmd, " -v");
 
-   if (status != 0)
-   {
-       destroyPQExpBuffer(retbuf);
-       return defval;
-   }
+   print_msg(VERBOSITY_DEBUG, _("Running pg_basebackup: %s.\n"), cmd->data);
+   ret = system(cmd->data);
 
-   ret = trimwhitespace(retbuf->data);
-   destroyPQExpBuffer(retbuf);
+   destroyPQExpBuffer(cmd);
 
-   return ret;
+   if (ret != 0)
+       die(_("pg_basebackup failed, cannot continue.\n"));
 }
 
 /*
  * Set system identifier to system id we used for registering the slots.
  */
 static int
-set_sysid(void)
+set_sysid(uint64 sysid)
 {
    int          ret;
    PQExpBuffer  cmd = createPQExpBuffer();
    char        *exec_path = find_other_exec_or_die(argv0, "bdr_resetxlog", "bdr_resetxlog (PostgreSQL) " PG_VERSION "\n");
 
-   appendPQExpBuffer(cmd, "%s \"-s "UINT64_FORMAT"\" \"%s\"", exec_path, system_identifier, data_dir);
+   appendPQExpBuffer(cmd, "%s \"-s "UINT64_FORMAT"\" \"%s\"", exec_path, sysid, data_dir);
 
+   print_msg(VERBOSITY_DEBUG, _("Running bdr_resetxlog: %s.\n"), cmd->data);
    ret = system(cmd->data);
 
    destroyPQExpBuffer(cmd);
@@ -427,105 +618,11 @@ set_sysid(void)
    return ret;
 }
 
-
-/*
- * Read bdr configuration
- *
- * This is somewhat ugly version of bdr_create_con_gucs and parts of _PG_init
- */
-static void
-read_bdr_config(void)
-{
-   char        *connections;
-   char        *errormsg = NULL;
-   int         connection_config_idx;
-   size_t      connection_count = 0;
-   char        **connames;
-   PQconninfoOption *options;
-   PQconninfoOption *cur_option;
-
-   connections = get_postgres_guc_value("bdr.connections", NULL);
-   if (!connections)
-       die(_("bdr.connections is empty\n"));
-
-   connames = split_list_guc(connections, &connection_count);
-   pg_free(connections);
-
-   bdr_connection_config_count = connection_count;
-   bdr_connection_configs = (BdrConnectionConfig**)
-       pg_malloc0(bdr_connection_config_count * sizeof(BdrConnectionConfig*));
-
-   for (connection_config_idx = 0; connection_config_idx < connection_count; connection_config_idx++)
-   {
-       char    *name = (char *) connames[connection_config_idx];
-       char    *optname_dsn = pg_malloc(strlen(name) + 30);
-       char    *optname_local_dsn = pg_malloc(strlen(name) + 30);
-       char    *optname_replica = pg_malloc(strlen(name) + 30);
-       char    *optname_local_dbname = pg_malloc(strlen(name) + 30);
-       BdrConnectionConfig *opts;
-
-       sprintf(optname_dsn, "bdr.%s_dsn", name);
-       sprintf(optname_local_dsn, "bdr.%s_replica_local_dsn", name);
-       sprintf(optname_replica, "bdr.%s_init_replica", name);
-       sprintf(optname_local_dbname, "bdr.%s_local_dbname", name);
-
-       opts = pg_malloc0(sizeof(BdrConnectionConfig));
-       opts->name = pg_strdup(name);
-       opts->is_valid = false;
-
-       bdr_connection_configs[connection_config_idx] = opts;
-
-       opts->dsn = get_postgres_guc_value(optname_dsn, NULL);
-       if (!opts->dsn)
-           continue;
-
-       opts->replica_local_dsn = get_postgres_guc_value(optname_local_dsn, NULL);
-
-       if (!parse_bool(get_postgres_guc_value(optname_replica, "false"), &opts->init_replica))
-           die(_("Invalid boolean value for configuration parameter \"%s\"\n"), optname_replica);
-
-       opts->dbname = get_postgres_guc_value(optname_local_dbname, NULL);
-
-       options = PQconninfoParse(opts->dsn, &errormsg);
-       if (errormsg != NULL)
-       {
-           char *str = pg_strdup(errormsg);
-
-           PQfreemem(errormsg);
-           die(_("bdr %s: error in dsn: %s\n"), name, str);
-       }
-
-       if (opts->dbname == NULL)
-       {
-           cur_option = options;
-           while (cur_option->keyword != NULL)
-           {
-               if (strcmp(cur_option->keyword, "dbname") == 0)
-               {
-                   if (cur_option->val == NULL)
-                       die(_("bdr %s: no dbname set\n"), name);
-
-                   opts->dbname = pg_strdup(cur_option->val);
-               }
-               cur_option++;
-           }
-       }
-
-
-       opts->is_valid = true;
-
-       /* cleanup */
-       PQconninfoFree(options);
-   }
-}
-
-
-
 /*
  * Cleans everything that was replicated via basebackup but we don't want it.
  */
 static void
-remove_unwanted_state(void)
+remove_unwanted_files(void)
 {
 #ifdef BUILDING_BDR
    DIR             *lldir;
@@ -535,6 +632,9 @@ remove_unwanted_state(void)
 
    printfPQExpBuffer(llpath, "%s/%s", data_dir, LLOGCDIR);
 
+   print_msg(VERBOSITY_DEBUG, _("Removing data from \"%s\" directory.\n"),
+             llpath->data);
+
    /*
     * Remove stray logical replication checkpoints
     */
@@ -577,121 +677,123 @@ remove_unwanted_state(void)
 #endif
 }
 
-
 /*
- * Initialize replication slots
+ * Initialize the data directory.
  *
- * Get connection configs from bdr and use the info
- * to register replication slots for future use.
+ * Runs pg_basebackup into data_dir when it is missing or empty (connstr is
+ * then required); otherwise checks that it is a valid postgres data directory.
+ *
+ * In any case, new postgresql.conf and pg_hba.conf will be copied to the
+ * datadir if they are provided.
  */
 static void
-initialize_replication_slots(bool init_replica)
+initialize_data_dir(char *data_dir, char *connstr,
+                   char *postgresql_conf, char *pg_hba_conf)
 {
-   int      i;
-
-   for (i = 0; i < bdr_connection_config_count; i++)
+   /* Run basebackup as needed. */
+   switch (pg_check_dir(data_dir))
    {
-       NameData     slot_name;
-       char         remote_ident[256];
-       RemoteInfo  *ri;
-       TimeLineID   tlid;
-       Oid          dboid;
-       char         system_identifier_s[32];
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
-       PQExpBuffer      conninfo = createPQExpBuffer();
-
-       if (!cfg || !cfg->is_valid || cfg->init_replica != init_replica)
-           continue;
-
-       printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn);
-       remote_conn = PQconnectdb(conninfo->data);
-       destroyPQExpBuffer(conninfo);
-
-       if (PQstatus(remote_conn) != CONNECTION_OK)
-       {
-           die(_("Could not connect to the remote server: %s\n"),
-                       PQerrorMessage(remote_conn));
-       }
-
-       ri = get_remote_info(remote_conn, cfg->dsn);
-       dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname);
-
-       /* XXX: this might break if timeline switch happens in meantime */
-       tlid = cfg->init_replica ? ri->tlid + 1 : ri->tlid;
-
-       snprintf(system_identifier_s, sizeof(system_identifier_s), UINT64_FORMAT, system_identifier);
-       snprintf(NameStr(slot_name), NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
-                ri->dboid, system_identifier_s, tlid,
-                dboid, "");
-       NameStr(slot_name)[NAMEDATALEN - 1] = '\0';
-
-       create_replication_slot(remote_conn, &slot_name);
+       case 0:     /* Does not exist */
+       case 1:     /* Exists, empty */
+           {
+               if (connstr)
+               {
+                   print_msg(VERBOSITY_NORMAL,
+                             _("Creating base backup of the remote node...\n"));
+                   run_basebackup(connstr, data_dir);
+               }
+               else
+                   die(_("Directory \"%s\" does not exist.\n"),
+                       data_dir);
+               break;
+           }
+       case 2:
+       case 3:     /* Exists, not empty */
+       case 4:
+           {
+               if (!is_pg_dir(data_dir))
+                   die(_("Directory \"%s\" exists but is not valid postgres data directory.\n"),
+                       data_dir);
+               break;
+           }
+       case -1:    /* Access problem */
+           die(_("Could not access directory \"%s\": %s.\n"),
+               data_dir, strerror(errno));
+   }
 
-       PQfinish(remote_conn);
-       remote_conn = NULL;
+   remove_unwanted_files();
 
-       snprintf(remote_ident, sizeof(remote_ident),
-               BDR_NODE_ID_FORMAT,
-               ri->sysid, ri->tlid, ri->dboid, dboid,
-               "");
-   }
+   if (postgresql_conf)
+       CopyConfFile(postgresql_conf, "postgresql.conf");
+   if (pg_hba_conf)
+       CopyConfFile(pg_hba_conf, "pg_hba.conf");
 }
 
 /*
- * Get database Oid of the remotedb.
+ * Create the logical replication slot for one database.
  *
- * Can't use the bdr_get_remote_dboid because it needs elog :(
+ * Builds the slot name from dboid and ni's local sysid/timeline and calls
+ * pg_create_logical_replication_slot(name, 'bdr') on conn; dies on failure.
  */
-static Oid
-get_remote_dboid(char *conninfo_db)
+static void
+initialize_replication_slot(PGconn *conn, NodeInfo *ni, Oid dboid)
 {
-   PGconn     *dbConn;
+   char        slotname[NAMEDATALEN];
+   char        system_identifier_s[32];
+   PQExpBuffer query = createPQExpBuffer();
    PGresult   *res;
-   char       *remote_dboid;
-   Oid         remote_dboid_i;
-
-   dbConn = PQconnectdb(conninfo_db);
-   if (PQstatus(dbConn) != CONNECTION_OK)
-   {
-       die(_("Could not connect to the primary server: %s"), PQerrorMessage(dbConn));
-   }
 
-   res = PQexec(dbConn, "SELECT oid FROM pg_database WHERE datname = current_database()");
-   if (PQresultStatus(res) != PGRES_TUPLES_OK)
-       die(_("Could fetch database oid: %s"), PQerrorMessage(dbConn));
+   snprintf(system_identifier_s, sizeof(system_identifier_s), UINT64_FORMAT, ni->local_sysid);
+   snprintf(slotname, NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
+            dboid, system_identifier_s, ni->local_tlid, dboid, "");
+   appendPQExpBuffer(query, "SELECT pg_create_logical_replication_slot(%s, '%s');",
+                     PQescapeLiteral(conn, slotname, NAMEDATALEN), "bdr");
 
-   if (PQntuples(res) != 1 || PQnfields(res) != 1)
-       die(_("Could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"),
-            PQntuples(res), PQnfields(res), 1, 1);
+   res = PQexec(conn, query->data);
 
-   remote_dboid = PQgetvalue(res, 0, 0);
-   if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
-       die(_("could not parse remote database OID %s"), remote_dboid);
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+   {
+       die(_("Could not create replication slot, status %s: %s\n"),
+            PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+   }
 
    PQclear(res);
-   PQfinish(dbConn);
-
-   return remote_dboid_i;
+   destroyPQExpBuffer(query);
 }
 
 /*
  * Read replication info about remote connection
  */
 static RemoteInfo *
-get_remote_info(PGconn *conn, char* aux_connstr)
+get_remote_info(char* remote_connstr)
 {
-   RemoteInfo  *ri = (RemoteInfo *)pg_malloc(sizeof(RemoteInfo));
+   RemoteInfo *ri = (RemoteInfo *)pg_malloc(sizeof(RemoteInfo));
    char       *remote_sysid;
    char       *remote_tlid;
-   char       *remote_dboid;
+   int         i;
    PGresult   *res;
+   PQExpBuffer conninfo = createPQExpBuffer();
+
+   /*
+    * Fetch the system identification info (sysid, tlid) via replication
+    * connection - there is no way to get this info via SQL.
+    */
+   printfPQExpBuffer(conninfo, "%s replication=database", remote_connstr);
+   remote_conn = PQconnectdb(conninfo->data);
+   destroyPQExpBuffer(conninfo);
+
+   if (PQstatus(remote_conn) != CONNECTION_OK)
+   {
+       die(_("Could not connect to the remote server: %s\n"),
+                   PQerrorMessage(remote_conn));
+   }
 
-   res = PQexec(conn, "IDENTIFY_SYSTEM");
+   res = PQexec(remote_conn, "IDENTIFY_SYSTEM");
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        PQclear(res);
        die(_("Could not send replication command \"%s\": %s\n"),
-            "IDENTIFY_SYSTEM", PQerrorMessage(conn));
+            "IDENTIFY_SYSTEM", PQerrorMessage(remote_conn));
    }
 
    if (PQntuples(res) != 1 || PQnfields(res) < 4 || PQnfields(res) > 5)
@@ -704,17 +806,6 @@ get_remote_info(PGconn *conn, char* aux_connstr)
    remote_sysid = PQgetvalue(res, 0, 0);
    remote_tlid = PQgetvalue(res, 0, 1);
 
-   if (PQnfields(res) == 5)
-   {
-       remote_dboid = PQgetvalue(res, 0, 4);
-       if (sscanf(remote_dboid, "%u", &ri->dboid) != 1)
-           die(_("could not parse remote database OID %s"), remote_dboid);
-   }
-   else
-   {
-       ri->dboid = get_remote_dboid(aux_connstr);
-   }
-
 #ifdef HAVE_STRTOULL
    ri->sysid = strtoull(remote_sysid, NULL, 10);
 #else
@@ -725,97 +816,97 @@ get_remote_info(PGconn *conn, char* aux_connstr)
        die(_("Could not parse remote tlid %s\n"), remote_tlid);
 
    PQclear(res);
+   PQfinish(remote_conn);
+   remote_conn = NULL;
 
-   return ri;
-}
+   /*
+    * Fetch list of BDR enabled databases via standard SQL connection.
+    */
+   remote_conn = PQconnectdb(remote_connstr);
+   if (PQstatus(remote_conn) != CONNECTION_OK)
+   {
+       die(_("Could not connect to the remote server: %s"), PQerrorMessage(remote_conn));
+   }
 
-/*
- * Get dboid based on dbname
- */
-static Oid
-get_dboid_from_dbname(PGconn *conn, const char* dbname)
-{
-   char        *dboid_str;
-   Oid          dboid;
-   PQExpBuffer  query = createPQExpBuffer();
-   PGresult    *res;
+   res = PQexec(remote_conn, "SELECT d.oid, d.datname "
+                "FROM pg_catalog.pg_database d, pg_catalog.pg_shseclabel l "
+                "WHERE l.provider = 'bdr' "
+                "  AND l.classoid = 'pg_database'::regclass "
+                "  AND d.oid = l.objoid;");
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       die(_("Could fetch remote database list: %s"), PQerrorMessage(remote_conn));
 
-   appendPQExpBuffer(query, "SELECT oid FROM pg_catalog.pg_database WHERE datname = '%s'",
-                    dbname);
+   ri->numdbs = PQntuples(res);
+   ri->dboids = (Oid *) pg_malloc(ri->numdbs * sizeof(Oid));
+   ri->dbnames = (char **) pg_malloc(ri->numdbs * sizeof(char *));
 
-   res = PQexec(conn, query->data);
-   if (PQresultStatus(res) != PGRES_TUPLES_OK || PQntuples(res) != 1)
+   for (i = 0; i < ri->numdbs; i++)
    {
-       PQclear(res);
-       die(_("Could not get database id for \"%s\": %s\n"),
-            dbname, PQerrorMessage(conn));
-   }
+       char   *remote_dboid = PQgetvalue(res, i, 0);
+       char   *remote_dbname = PQgetvalue(res, i, 1);
+       Oid     remote_dboid_i;
 
-   dboid_str = PQgetvalue(res, 0, 0);
-   if (sscanf(dboid_str, "%u", &dboid) != 1)
-       die(_("Could not parse database OID %s\n"), dboid_str);
+       if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
+           die(_("Could not parse database OID %s"), remote_dboid);
+
+       ri->dboids[i] = remote_dboid_i;
+       ri->dbnames[i] = pstrdup(remote_dbname);
+   }
 
    PQclear(res);
-   destroyPQExpBuffer(query);
 
-   return dboid;
+   PQfinish(remote_conn);
+   remote_conn = NULL;
+
+   return ri;
 }
 
+
 /*
- * Create replication slot
+ * Return true if pg_catalog.pg_extension (queried via conn) has exactly one row for extname.
  */
-static void
-create_replication_slot(PGconn *conn, Name slot_name)
+static bool
+extension_exists(PGconn *conn, const char *extname)
 {
-   PQExpBuffer query = createPQExpBuffer();
-   PGresult   *res;
-
-   appendPQExpBuffer(query, "CREATE_REPLICATION_SLOT \"%s\" LOGICAL %s",
-                    NameStr(*slot_name), "bdr");
+   PQExpBuffer     query = createPQExpBuffer();
+   PGresult       *res;
+   bool            ret;
 
+   printfPQExpBuffer(query, "SELECT 1 FROM pg_catalog.pg_extension WHERE extname = %s;",
+                     PQescapeLiteral(conn, extname, strlen(extname)));
    res = PQexec(conn, query->data);
 
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
-       die(_("Could not send replication command \"%s\": status %s: %s\n"),
-            query->data,
-            PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+       PQclear(res);
+       die(_("Could not read extension info: %s\n"), PQerrorMessage(conn));
    }
 
+   ret = PQntuples(res) == 1;
+
    PQclear(res);
    destroyPQExpBuffer(query);
+
+   return ret;
 }
 
-#ifdef BUILDING_UDR
+/*
+ * Create extension.
+ */
 static void
-install_extension_if_not_exists(PGconn *conn, const char *extname)
+install_extension(PGconn *conn, const char *extname)
 {
    PQExpBuffer     query = createPQExpBuffer();
    PGresult       *res;
 
-   printfPQExpBuffer(query, "SELECT 1 FROM pg_catalog.pg_extension WHERE extname = %s;",
-                     PQescapeLiteral(conn, extname, strlen(extname)));
+   printfPQExpBuffer(query, "CREATE EXTENSION %s;",
+                     PQescapeIdentifier(conn, extname, strlen(extname)));
    res = PQexec(conn, query->data);
 
-   if (PQresultStatus(res) != PGRES_TUPLES_OK)
-   {
-       PQclear(res);
-       die(_("Could not read extension info: %s\n"), PQerrorMessage(conn));
-   }
-
-   if (PQntuples(res) != 1)
+   if (PQresultStatus(res) != PGRES_COMMAND_OK)
    {
        PQclear(res);
-
-       printfPQExpBuffer(query, "CREATE EXTENSION %s;",
-                         PQescapeIdentifier(conn, extname, strlen(extname)));
-       res = PQexec(conn, query->data);
-
-       if (PQresultStatus(res) != PGRES_COMMAND_OK)
-       {
-           PQclear(res);
-           die(_("Could not install %s extension: %s\n"), extname, PQerrorMessage(conn));
-       }
+       die(_("Could not install %s extension: %s\n"), extname, PQerrorMessage(conn));
    }
 
    PQclear(res);
@@ -823,81 +914,104 @@ install_extension_if_not_exists(PGconn *conn, const char *extname)
 }
 
 /*
- * Initialize bdr extension (if not already initialized).
- *
- * Should have similar logic as bdr_maintain_schema in bdr.c.
+ * Validate that BDR extension is installed on remote node
+ * and that there is at least one BDR node entry present.
  */
 static void
-initialize_bdr(PGconn *conn)
+validate_remote_node(PGconn *conn)
 {
-   install_extension_if_not_exists(conn, "btree_gist");
-   install_extension_if_not_exists(conn,"bdr");
-}
-#endif
+   PGresult   *res;
+   PQExpBuffer query = createPQExpBuffer();
 
-/*
- * Initialize new remote identifiers to specific position.
- */
-static void
-initialize_replication_identifiers(char *remote_lsn)
-{
-   int              i;
-   PGresult        *res;
+   if (!extension_exists(conn, "bdr"))
+       die(_("The BDR extension must be installed on remote node.\n"));
 
-   /* Remove replication identifiers */
-   res = PQexec(local_conn, "SELECT "RIINTERFACE_PREFIX"replication_identifier_drop(riname) FROM "RIINTERFACE_PREFIX"replication_identifier;");
+#ifdef BUILDING_BDR
+   res = PQexec(conn, "SELECT 1 FROM bdr.bdr_nodes;");
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        PQclear(res);
-       die(_("Could not remove replication identifier: %s\n"), PQerrorMessage(local_conn));
+       die(_("Could fetch BDR info: %s\n"), PQerrorMessage(conn));
    }
 
-   /* Initialize new replication identifiers */
-   for (i = 0; i < bdr_connection_config_count; i++)
-   {
-       char        remote_ident[256];
-       Oid         dboid;
-       RemoteInfo  *ri;
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
-       PQExpBuffer conninfo = createPQExpBuffer();
+   if (PQntuples(res) < 1)
+       die(_("The remote node is not configured as a BDR node.\n"));
 
-       if (!cfg || !cfg->is_valid)
-           continue;
+   PQclear(res);
+#endif
 
-       printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn);
-       remote_conn = PQconnectdb(conninfo->data);
-       destroyPQExpBuffer(conninfo);
+   destroyPQExpBuffer(query);
+}
 
-       if (PQstatus(remote_conn) != CONNECTION_OK)
-       {
-           die(_("Could not connect to the remote server: %s\n"),
-                       PQerrorMessage(remote_conn));
-       }
 
-       ri = get_remote_info(remote_conn, cfg->dsn);
-       dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname);
+/*
+ * Add a bdr.bdr_nodes row (status 'c') for the local node, using conn.
+ */
+void
+initialize_node_entry(PGconn *conn, NodeInfo *ni, Oid dboid,
+                     char *remote_connstr)
+{
+   PGresult       *result;
+   PQExpBuffer     sql = createPQExpBuffer();
 
-       PQfinish(remote_conn);
-       remote_conn = NULL;
+   /* sysid/timeline come from ni; remote_connstr is stored as the init DSN. */
+   printfPQExpBuffer(sql, "INSERT INTO bdr.bdr_nodes"
+                          " (node_status, node_sysid, node_timeline,"
+                          "  node_dboid, node_init_from_dsn)"
+                          " VALUES ('c', '"UINT64_FORMAT"', %u, %u, %s);",
+                     ni->local_sysid, ni->local_tlid, dboid,
+                     PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)));
+   result = PQexec(conn, sql->data);
+   if (PQresultStatus(result) == PGRES_COMMAND_OK)
+   {
+       PQclear(result);
+       destroyPQExpBuffer(sql);
+       return;
+   }
+   PQclear(result);
+   die(_("Failed to insert row into bdr.bdr_nodes: %s\n"), PQerrorMessage(conn));
+}
+
+/*
+ * Remove state copied from the remote node that is unwanted here: BDR shared
+ * security labels and all replication identifiers. dbname is currently unused.
+ */
+static void
+remove_unwanted_data(PGconn *conn, char *dbname)
+{
+   PGresult       *res;
 
-       snprintf(remote_ident, sizeof(remote_ident),
-               BDR_NODE_ID_FORMAT,
-               ri->sysid, ri->tlid, ri->dboid, dboid,
-               "");
+   /* Remove any BDR security labels. */
+   res = PQexec(conn, "DELETE FROM pg_catalog.pg_shseclabel WHERE provider = 'bdr';");
 
-       create_replication_identifier(local_conn, remote_ident,
-                                     cfg->init_replica ? remote_lsn : NULL);
+   if (PQresultStatus(res) != PGRES_COMMAND_OK)
+   {
+       PQclear(res);
+       die(_("Could not update security label: %s\n"), PQerrorMessage(conn));
+   }
+   PQclear(res);       /* release the DELETE result before res is reused */
+   /* Remove replication identifiers. */
+   res = PQexec(conn, "SELECT "RIINTERFACE_PREFIX"replication_identifier_drop(riname) FROM "RIINTERFACE_PREFIX"replication_identifier;");
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+   {
+       PQclear(res);
+       die(_("Could not remove existing replication identifiers: %s\n"), PQerrorMessage(conn));
    }
+   PQclear(res);
 }
 
 /*
- * Create local replication identifier
+ * Initialize new remote identifier to specific position.
  */
 static void
-create_replication_identifier(PGconn *conn, const char *remote_ident, char *remote_lsn)
+initialize_replication_identifier(PGconn *conn, NodeInfo *ni, Oid dboid, char *remote_lsn)
 {
-   PQExpBuffer query = createPQExpBuffer();
    PGresult   *res;
+   char        remote_ident[256];
+   PQExpBuffer query = createPQExpBuffer();
+
+   snprintf(remote_ident, sizeof(remote_ident), BDR_NODE_ID_FORMAT,
+               ni->remote_sysid, ni->remote_tlid, dboid, dboid, "");
 
    printfPQExpBuffer(query, "SELECT "RIINTERFACE_PREFIX"replication_identifier_create('%s')",
                     remote_ident);
@@ -937,76 +1051,70 @@ create_replication_identifier(PGconn *conn, const char *remote_ident, char *remo
  * state through physical replay.
  */
 static char *
-create_restore_point(char *remote_connstr)
+create_restore_point(PGconn *conn, char *restore_point_name)
 {
    PQExpBuffer  query = createPQExpBuffer();
    PGresult    *res;
    char        *remote_lsn = NULL;
 
-   remote_conn = PQconnectdb(remote_connstr);
-   if (PQstatus(remote_conn) != CONNECTION_OK)
-   {
-       die(_("Could not connect to the remote server: %s\n"),
-                   PQerrorMessage(remote_conn));
-   }
-
-   printfPQExpBuffer(query, "SELECT pg_create_restore_point('%s')", NameStr(restore_point_name));
-   res = PQexec(remote_conn, query->data);
+   printfPQExpBuffer(query, "SELECT pg_create_restore_point('%s')", restore_point_name);
+   res = PQexec(conn, query->data);   /* NOTE(review): the name above is spliced into the SQL unescaped - confirm callers pass a safe value */
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
-       die(_("Could not create restore point \"%s\": status %s: %s\n"),
-            query->data,
+       die(_("Could not create restore point, status %s: %s\n"),
             PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
    }
    remote_lsn = pstrdup(PQgetvalue(res, 0, 0));
 
    PQclear(res);
-   PQfinish(remote_conn);
-   remote_conn = NULL;
    destroyPQExpBuffer(query);
 
    return remote_lsn;
 }
 
-static char *
-detect_local_conninfo(void)
-{
-   int i;
 
-   for (i = 0; i < bdr_connection_config_count; i++)
-   {
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
-
-       if (!cfg || !cfg->is_valid || !cfg->init_replica ||
-           !cfg->replica_local_dsn)
-           continue;
-
-       return pg_strdup(cfg->replica_local_dsn);
-   }
-
-   return NULL;
-}
-
-static char *
-detect_remote_conninfo(void)
+static void
+bdr_node_start(PGconn *conn, char *remote_connstr, char *local_connstr)
 {
-   int i;
+   PQExpBuffer  query = createPQExpBuffer();
+   PGresult    *res;
 
-   for (i = 0; i < bdr_connection_config_count; i++)
-   {
-       BdrConnectionConfig *cfg = bdr_connection_configs[i];
+   /* Install btree_gist and bdr on conn unless they already exist. */
+   if (!extension_exists(conn, "btree_gist"))
+       install_extension(conn, "btree_gist");
+   if (!extension_exists(conn, "bdr"))
+       install_extension(conn, "bdr");
 
-       if (!cfg || !cfg->is_valid || !cfg->init_replica)
-           continue;
+   /* Join via bdr.bdr_group_join() in BDR builds, bdr.bdr_subscribe() otherwise. */
+#ifdef BUILDING_BDR
+   printfPQExpBuffer(query, "SELECT bdr.bdr_group_join(%s, %s);",
+                     PQescapeLiteral(conn, local_connstr, strlen(local_connstr)),
+                     PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)));
+#else /* argument order here is (remote, local), the reverse of the call above */
+   printfPQExpBuffer(query, "SELECT bdr.bdr_subscribe(%s, %s);",
+                     PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)),
+                     PQescapeLiteral(conn, local_connstr, strlen(local_connstr)));
+#endif
 
-       return pg_strdup(cfg->dsn);
+   res = PQexec(conn, query->data);
+   if (PQresultStatus(res) != PGRES_TUPLES_OK)
+   {
+       die(_("Could not add local node to cluster, status %s: %s\n"),
+            PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
    }
 
-   return NULL;
+   PQclear(res);
+   destroyPQExpBuffer(query);
 }
 
+/*
+ * Build connection string from individual parameter.
+ *
+ * This function also handles case where full connection string was
+ * specified instead of dbname.
+ */
 char *
-get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser)
+get_connstr(char *dbname, char *dbhost, char *dbport, char *dbuser)
 {
    char        *ret;
    int         argcount = 4;   /* dbname, host, user, port */
@@ -1053,10 +1161,6 @@ get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser)
    {
        keywords = pg_malloc0((argcount + 1) * sizeof(*keywords));
        values = pg_malloc0((argcount + 1) * sizeof(*values));
-
-       keywords[i] = "dbname";
-       values[i] = dbname == NULL ? "postgres" : dbname;
-       i++;
    }
 
    if (dbhost)
@@ -1078,7 +1182,7 @@ get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser)
        i++;
    }
 
-   ret = PQconninfoParams_to_conninfo(keywords, values);
+   ret = PQconninfoParamsToConnstr(keywords, values);
 
    /* Connection ok! */
    pg_free(values);
@@ -1137,22 +1241,38 @@ WriteRecoveryConf(PQExpBuffer contents)
    fclose(cf);
 }
 
+/*
+ * Copy fromfile to data_dir/tofile; dies if that path does not fit in MAXPGPATH.
+ */
+static void
+CopyConfFile(char *fromfile, char *tofile)
+{
+   char        filename[MAXPGPATH];
+
+   if ((size_t) snprintf(filename, sizeof(filename), "%s/%s", data_dir, tofile) >= sizeof(filename))
+       die(_("Path \"%s/%s\" is too long.\n"), data_dir, tofile);
+   print_msg(VERBOSITY_DEBUG, _("Copying \"%s\" to \"%s\".\n"),
+             fromfile, filename);
+   copy_file(fromfile, filename);
+}
+
+
 /*
  * Convert PQconninfoOption array into conninfo string
  */
 static char *
-PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values)
+PQconninfoParamsToConnstr(const char *const * keywords, const char *const * values)
 {
    PQExpBuffer  retbuf = createPQExpBuffer();
    char        *ret;
    int          i = 0;
 
-   while (keywords[i])
+   for (i = 0; keywords[i] != NULL; i++)
    {
-       char *tmpval = escapeConninfoValue(values[i]);
-       appendPQExpBuffer(retbuf, "%s = '%s' ", keywords[i], tmpval);
-       pg_free(tmpval);
-       i++;
+       if (i > 0)
+           appendPQExpBufferChar(retbuf, ' ');
+       appendPQExpBuffer(retbuf, "%s=", keywords[i]);
+       appendPQExpBufferConnstrValue(retbuf, values[i]);
    }
 
    ret = pg_strdup(retbuf->data);
@@ -1164,371 +1284,130 @@ PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * v
 /*
  * Escape connection info value
  */
-static char *
-escapeConninfoValue(const char *val)
+static void
+appendPQExpBufferConnstrValue(PQExpBuffer buf, const char *str)
 {
-   int i, j;
-   char *ret = pg_malloc(strlen(val) * 2 + 1);
+   const char *s;
+   bool        needquotes;
 
-   j = 0;
-   for (i = 0; i < strlen(val); i++)
+   /*
+    * A non-empty string made only of [A-Za-z0-9_.] is emitted unquoted; anything
+    * else, including the empty string, is quoted so libpq parses it as one value.
+    */
+   needquotes = (*str == '\0');
+   for (s = str; *s; s++)
    {
-       switch (val[i])
+       if (!((*s >= 'a' && *s <= 'z') || (*s >= 'A' && *s <= 'Z') ||
+             (*s >= '0' && *s <= '9') || *s == '_' || *s == '.'))
        {
-           case '\\':
-           case '\'':
-               ret[j++] = '\\';
-           default:
-               break;
+           needquotes = true;
+           break;
        }
-
-       ret[j++] = val[i];
    }
 
-   ret[j] = '\0';
-
-   return ret;
-}
-
-
-/*
- * Taken from adt/bool.c
- *
- * Try to interpret value as boolean value.  Valid values are: true,
- * false, yes, no, on, off, 1, 0; as well as unique prefixes thereof.
- * If the string parses okay, return true, else false.
- * If okay and result is not NULL, return the value in *result.
- */
-static bool
-parse_bool(const char *value, bool *result)
-{
-   return parse_bool_with_len(value, strlen(value), result);
-}
-
-static bool
-parse_bool_with_len(const char *value, size_t len, bool *result)
-{
-   switch (*value)
+   if (needquotes)
    {
-       case 't':
-       case 'T':
-           if (pg_strncasecmp(value, "true", len) == 0)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           break;
-       case 'f':
-       case 'F':
-           if (pg_strncasecmp(value, "false", len) == 0)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       case 'y':
-       case 'Y':
-           if (pg_strncasecmp(value, "yes", len) == 0)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           break;
-       case 'n':
-       case 'N':
-           if (pg_strncasecmp(value, "no", len) == 0)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       case 'o':
-       case 'O':
-           /* 'o' is not unique enough */
-           if (pg_strncasecmp(value, "on", (len > 2 ? len : 2)) == 0)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           else if (pg_strncasecmp(value, "off", (len > 2 ? len : 2)) == 0)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       case '1':
-           if (len == 1)
-           {
-               if (result)
-                   *result = true;
-               return true;
-           }
-           break;
-       case '0':
-           if (len == 1)
-           {
-               if (result)
-                   *result = false;
-               return true;
-           }
-           break;
-       default:
-           break;
-   }
-
-   if (result)
-       *result = false;        /* suppress compiler warning */
-   return false;
-}
-
-/*
- * Remove leading and trailing whitespace from the string,
- * does not change input
- */
-static char *
-trimwhitespace(const char *str)
-{
-   const char *end;
-   char *res;
-   size_t len;
-
-   while(isspace(*str))
-       str++;
-
-   if(*str == 0)
-       return NULL;
-
-   end = str + strlen(str) - 1;
-   while(end > str && isspace(*end))
-       end--;
-
-   len = end-str;
-   if (!len)
-       return NULL;
-
-   len++;
-   res = pg_malloc(len+1);
-   memcpy(res, str, len);
-   res[len] = '\0';
-
-   return res;
-}
-
-/*
- * Split guc list paramenter into array
- * Note that this is not 100% compatible with that is in core
- * but seems good enough for our purposes
- */
-static char    **
-split_list_guc(char *str, size_t *count)
-{
-   char    **ret = NULL;
-   char     *t = strtok (str, ",");
-   size_t    i = 0;
-
-   while (t) {
-       ret = realloc(ret, sizeof(char*)* ++i);
-
-       if (ret == NULL)
-           die(_("Out of memory\n"));
-
-       t = trimwhitespace(t);
-       if (!t)
-           die(_("Bad input for list: %s\n"), str);
-
-       ret[i-1] = t;
+       appendPQExpBufferChar(buf, '\'');
+       while (*str)
+       {
+           /* ' and \ must be escaped to \' and \\ */
+           if (*str == '\'' || *str == '\\')
+               appendPQExpBufferChar(buf, '\\');
 
-       t = strtok(NULL, ",");
+           appendPQExpBufferChar(buf, *str);
+           str++;
+       }
+       appendPQExpBufferChar(buf, '\'');
    }
-
-   *count = i;
-   return ret;
+   else
+       appendPQExpBufferStr(buf, str);
 }
 
 
 /*
  * Find the pgport and try a connection
- *
- * Based on pg_ctl.c:test_postmaster_connection
  */
-static bool
-wait_postmaster_connection(void)
+static void
+wait_postmaster_connection(const char *connstr)
 {
    PGPing      res;
-   long        pm_pid = 0;
-   char        connstr[MAXPGPATH * 2 + 256];
+   long        pmpid = 0;
 
-   connstr[0] = '\0';
+   print_msg(VERBOSITY_VERBOSE, "Waiting for PostgreSQL to accept connections ...");
 
+   /* First wait for Postmaster to come up. */
    for (;;)
    {
-       /* Do we need a connection string? */
-       if (connstr[0] == '\0')
-       {
-           /*----------
-            * The number of lines in postmaster.pid tells us several things:
-            *
-            * # of lines
-            *      0   lock file created but status not written
-            *      2   pre-9.1 server, shared memory not created
-            *      3   pre-9.1 server, shared memory created
-            *      5   9.1+ server, ports not opened
-            *      6   9.1+ server, shared memory not created
-            *      7   9.1+ server, shared memory created
-            *
-            * If we see less than 6 lines in postmaster.pid, just keep
-            * waiting.
-            *----------
-            */
-           char      **optlines;
-
-           /* Try to read the postmaster.pid file */
-           if ((optlines = readfile(pid_file)) != NULL &&
-               optlines[0] != NULL &&
-               optlines[1] != NULL &&
-               optlines[2] != NULL &&
-               optlines[3] != NULL &&
-               optlines[4] != NULL &&
-               optlines[5] != NULL)
-           {
-               /* File is complete enough for us, parse it */
-               long        pmpid;
-               time_t      pmstart;
-
-               /*
-                * Make sanity checks.  If it's for a standalone backend
-                * (negative PID), or the recorded start time is before
-                * pg_ctl started, then either we are looking at the wrong
-                * data directory, or this is a pre-existing pidfile that
-                * hasn't (yet?) been overwritten by our child postmaster.
-                * Allow 2 seconds slop for possible cross-process clock
-                * skew.
-                */
-               pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
-               pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
-               if (pmpid > 0 || pmstart > start_time - 3)
-               {
-                   /*
-                    * OK, seems to be a valid pidfile from our child.
-                    */
-                   int         portnum;
-                   char       *sockdir;
-                   char       *hostaddr;
-                   char        host_str[MAXPGPATH];
-
-                   pm_pid = pmpid;
-
-                   /*
-                    * Extract port number and host string to use. Prefer
-                    * using Unix socket if available.
-                    */
-                   portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]);
-                   sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1];
-                   hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1];
-
-                   /*
-                    * While unix_socket_directories can accept relative
-                    * directories, libpq's host parameter must have a
-                    * leading slash to indicate a socket directory.  So,
-                    * ignore sockdir if it's relative, and try to use TCP
-                    * instead.
-                    */
-                   if (sockdir[0] == '/')
-                       strlcpy(host_str, sockdir, sizeof(host_str));
-                   else
-                       strlcpy(host_str, hostaddr, sizeof(host_str));
-
-                   /* remove trailing newline */
-                   if (strchr(host_str, '\n') != NULL)
-                       *strchr(host_str, '\n') = '\0';
-
-                   /* Fail if couldn't get either sockdir or host addr */
-                   if (host_str[0] == '\0')
-                   {
-                       fprintf(stderr, _("Relative socket directory is not supported\n"));
-                       return false;
-                   }
-
-                   /* If postmaster is listening on "*", use localhost */
-                   if (strcmp(host_str, "*") == 0)
-                       strcpy(host_str, "localhost");
-
-                   /*
-                    * We need to set connect_timeout otherwise on Windows
-                    * the Service Control Manager (SCM) will probably
-                    * timeout first.
-                    */
-                   snprintf(connstr, sizeof(connstr),
-                   "dbname=postgres port=%d host='%s' connect_timeout=5",
-                            portnum, host_str);
-               }
-           }
+       if ((pmpid = get_pgpid()) != 0 &&
+           postmaster_is_alive((pid_t) pmpid))
+           break;
 
-           /*
-            * Free the results of readfile.
-            *
-            * This is safe to call even if optlines is NULL.
-            */
-           free_readfile(optlines);
-       }
+       pg_usleep(1000000);     /* 1 sec */
+       print_msg(VERBOSITY_VERBOSE, ".");
+   }
 
-       /* If we have a connection string, ping the server */
-       if (connstr[0] != '\0')
-       {
-           res = PQping(connstr);
-           if (res == PQPING_OK)
-           {
-               break;
-           }
-           else if (res == PQPING_NO_ATTEMPT)
-               return false;
-       }
+   /* Now wait for Postmaster to either accept connections or die. */
+   for (;;)
+   {
+       res = PQping(connstr);
+       if (res == PQPING_OK)
+           break;
+       else if (res == PQPING_NO_ATTEMPT)
+           break;
 
        /*
-        * If we've been able to identify the child postmaster's PID, check
-        * the process is still alive.  This covers cases where the postmaster
-        * successfully created the pidfile but then crashed without removing
-        * it.
+        * Check if the process is still alive. This covers cases where the
+        * postmaster successfully created the pidfile but then crashed without
+        * removing it.
         */
-       if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
-           return false;
+       if (!postmaster_is_alive((pid_t) pmpid))
+           break;
 
-       /* No response, or startup still in process; wait */
+       /* No response; wait */
        pg_usleep(1000000);     /* 1 sec */
-       print_msg(".");
+       print_msg(VERBOSITY_VERBOSE, ".");
    }
 
-   return true;
+   print_msg(VERBOSITY_VERBOSE, "\n");
 }
 
 /*
  * Wait for postmaster to die
  */
 static void
-wait_postgres_shutdown(void)
+wait_postmaster_shutdown(void)
 {
    long pid;
 
+   print_msg(VERBOSITY_VERBOSE, "Waiting for PostgreSQL to shutdown ...");
+
    for (;;)
    {
        if ((pid = get_pgpid()) != 0)
        {
            pg_usleep(1000000);     /* 1 sec */
-           print_msg(".");
+           print_msg(VERBOSITY_NORMAL, ".");
        }
        else
            break;
    }
+
+   print_msg(VERBOSITY_VERBOSE, "\n");
+}
+
+/*
+ * Report whether stat() succeeds on the given path.
+ */
+static bool
+file_exists(const char *path)
+{
+   struct stat st;
+
+   return stat(path, &st) == 0;
 }
 
 static bool
-is_pg_dir(char *path)
+is_pg_dir(const char *path)
 {
    struct stat statbuf;
    char        version_file[MAXPGPATH];
@@ -1545,6 +1424,63 @@ is_pg_dir(char *path)
    return true;
 }
 
+/*
+ * Copy the contents of fromfile to tofile.
+ *
+ * tofile is created if missing and truncated if it already exists, so a
+ * shorter source never leaves stale trailing bytes from an older, longer
+ * destination. Every failure is reported through die(); like the other
+ * error paths here, the allocation check assumes die() does not return.
+ */
+static void
+copy_file(char *fromfile, char *tofile)
+{
+   char       *buffer;
+   int         srcfd;
+   int         dstfd;
+   int         nbytes;
+
+#define COPY_BUF_SIZE (8 * BLCKSZ)
+
+   buffer = malloc(COPY_BUF_SIZE);
+   if (buffer == NULL)
+       die(_("out of memory"));
+
+   /*
+    * Open the files
+    */
+   srcfd = open(fromfile, O_RDONLY | PG_BINARY, 0);
+   if (srcfd < 0)
+       die(_("could not open file \"%s\""), fromfile);
+
+   /*
+    * Without O_CREAT the mode argument is ignored and a missing target can
+    * never be created; without O_TRUNC an existing longer target keeps its
+    * old tail.
+    */
+   dstfd = open(tofile, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+                S_IRUSR | S_IWUSR);
+   if (dstfd < 0)
+       die(_("could not create file \"%s\""), tofile);
+
+   /*
+    * Do the data copying.
+    */
+   for (;;)
+   {
+       nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+       if (nbytes < 0)
+           die(_("could not read file \"%s\""), fromfile);
+       if (nbytes == 0)
+           break;
+       errno = 0;
+       if ((int) write(dstfd, buffer, nbytes) != nbytes)
+       {
+           /* if write didn't set errno, assume problem is no disk space */
+           if (errno == 0)
+               errno = ENOSPC;
+           die(_("could not write to file \"%s\""), tofile);
+       }
+   }
+
+   if (close(dstfd))
+       die(_("could not close file \"%s\""), tofile);
+
+   /* we don't care about errors here */
+   close(srcfd);
+
+   free(buffer);
+}
+
+
 /*
  * Utility functions taken from pg_ctl
  */
@@ -1622,112 +1558,3 @@ get_pgpid(void)
    fclose(pidf);
    return pid;
 }
-
-/*
- * get the lines from a text file - return NULL if file can't be opened
- */
-static char **
-readfile(const char *path)
-{
-   int         fd;
-   int         nlines;
-   char      **result;
-   char       *buffer;
-   char       *linebegin;
-   int         i;
-   int         n;
-   int         len;
-   struct stat statbuf;
-
-   /*
-    * Slurp the file into memory.
-    *
-    * The file can change concurrently, so we read the whole file into memory
-    * with a single read() call. That's not guaranteed to get an atomic
-    * snapshot, but in practice, for a small file, it's close enough for the
-    * current use.
-    */
-   fd = open(path, O_RDONLY | PG_BINARY, 0);
-   if (fd < 0)
-       return NULL;
-   if (fstat(fd, &statbuf) < 0)
-   {
-       close(fd);
-       return NULL;
-   }
-   if (statbuf.st_size == 0)
-   {
-       /* empty file */
-       close(fd);
-       result = (char **) pg_malloc(sizeof(char *));
-       *result = NULL;
-       return result;
-   }
-   buffer = pg_malloc(statbuf.st_size + 1);
-
-   len = read(fd, buffer, statbuf.st_size + 1);
-   close(fd);
-   if (len != statbuf.st_size)
-   {
-       /* oops, the file size changed between fstat and read */
-       free(buffer);
-       return NULL;
-   }
-
-   /*
-    * Count newlines. We expect there to be a newline after each full line,
-    * including one at the end of file. If there isn't a newline at the end,
-    * any characters after the last newline will be ignored.
-    */
-   nlines = 0;
-   for (i = 0; i < len; i++)
-   {
-       if (buffer[i] == '\n')
-           nlines++;
-   }
-
-   /* set up the result buffer */
-   result = (char **) pg_malloc((nlines + 1) * sizeof(char *));
-
-   /* now split the buffer into lines */
-   linebegin = buffer;
-   n = 0;
-   for (i = 0; i < len; i++)
-   {
-       if (buffer[i] == '\n')
-       {
-           int         slen = &buffer[i] - linebegin + 1;
-           char       *linebuf = pg_malloc(slen + 1);
-
-           memcpy(linebuf, linebegin, slen);
-           linebuf[slen] = '\0';
-           result[n++] = linebuf;
-           linebegin = &buffer[i + 1];
-       }
-   }
-   result[n] = NULL;
-
-   free(buffer);
-
-   return result;
-}
-
-/*
- * Free memory allocated for optlines through readfile()
- */
-void
-free_readfile(char **optlines)
-{
-   char       *curr_line = NULL;
-   int         i = 0;
-
-   if (!optlines)
-       return;
-
-   while ((curr_line = optlines[i++]))
-       free(curr_line);
-
-   free(optlines);
-
-   return;
-}
index b5a5931bd612395040c7604e0f113f44677267eb..36e50c1cf15ff4e4a74f0c90deabd819b3b79ee7 100644 (file)
 #include "storage/shmem.h"
 
 #include "utils/builtins.h"
+#include "utils/memutils.h"
 #include "utils/pg_lsn.h"
 #include "utils/syscache.h"
 
-char *bdr_temp_dump_directory = NULL;
-bool bdr_init_from_basedump = false;
-
-static void bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot);
-
-static void bdr_catchup_to_lsn(int cfg_index,
-                              XLogRecPtr target_lsn);
-
-/*
- * Search BdrWorkerCtl for a worker in dbname with init_replica set and
- * return it. The first worker found is returned (previous code should've
- * ensured there can only be one). If no match is found, return null.
- *
- * Must be called with at least a share lock on BdrWorkerCtl->lock
- *
- */
-static BdrWorker*
-find_init_replica_worker(Name dbname)
-{
-   int off;
-
-   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
-   /* Check whether one of our connections has init_replica set */
-   for (off = 0; off < bdr_max_workers; off++)
-   {
-       BdrApplyWorker         *aw;
-       BdrConnectionConfig    *cfg;
 
-       if (BdrWorkerCtl->slots[off].worker_type != BDR_WORKER_APPLY)
-           continue;
+char *bdr_temp_dump_directory = NULL;
 
-       aw = &BdrWorkerCtl->slots[off].data.apply;
-       cfg = bdr_connection_configs[aw->connection_config_idx];
+static void bdr_init_exec_dump_restore(BDRNodeInfo *node,
+                                      char *snapshot);
 
-       if ((strcmp(cfg->dbname, NameStr(*dbname)) == 0)
-           && cfg->init_replica)
-       {
-           return &BdrWorkerCtl->slots[off];
-       }
-   }
-   return NULL;
-}
+static void bdr_catchup_to_lsn(remote_node_info *ri, XLogRecPtr target_lsn);
 
 /*
- * Get this node's status value from the remote's bdr.bdr_nodes table
- * and return it.
+ * Make sure remote node has BDR activated (insert the security label).
  *
- * If no row is found, '\0' is returned.
+ * This is only needed for UDR.
  */
-static char
-bdr_get_remote_status(PGconn *pgconn)
+static void
+bdr_remote_activate(PGconn *pgconn)
 {
    PGresult           *res;
-   char                status;
-   Oid                 param_types[] = {TEXTOID, OIDOID, OIDOID};
-   const char         *param_values[3];
-   /* Needs to fit max length of UINT64_FORMAT */
-   char                sysid_str[33];
-   char                tlid_str[33];
-   char                mydatabaseid_str[33];
-
-   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
-            GetSystemIdentifier());
-   sysid_str[sizeof(sysid_str)-1] = '\0';
-
-   snprintf(tlid_str, sizeof(tlid_str), "%u",
-            ThisTimeLineID);
-   tlid_str[sizeof(tlid_str)-1] = '\0';
 
-   snprintf(mydatabaseid_str, sizeof(mydatabaseid_str), "%u",
-            MyDatabaseId);
-   mydatabaseid_str[sizeof(mydatabaseid_str)-1] = '\0';
-
-   param_values[0] = sysid_str;
-   param_values[1] = tlid_str;
-   param_values[2] = mydatabaseid_str;
-
-   res = PQexecParams(pgconn,
-                      "SELECT node_status FROM bdr.bdr_nodes "
-                      "WHERE node_sysid = $1 AND node_timeline = $2 "
-                      "AND node_dboid = $3 "
-                      "FOR UPDATE",
-                      3, param_types, param_values, NULL, NULL, 0);
+   res = PQexec(pgconn, "SELECT bdr.internal_update_seclabel()");
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
-       elog(FATAL, "bdr: Failed to get remote status during bdr init: state %s: %s\n",
+       elog(FATAL, "bdr: Failed to activate remote node during bdr init: state %s: %s\n",
             PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
    }
-   if (PQntuples(res) == 0)
-       /* No row found on remote, we're starting from scratch */
-       status = '\0';
-   else
-   {
-       char *status_str = PQgetvalue(res, 0, 0);
-       Assert(strlen(status_str) == 1);
-       status = status_str[0];
-   }
    PQclear(res);
-
-   return status;
-}
-
-/*
- * Update/delete/insert in bdr.bdr_nodes to ensure that the bdr.bdr_nodes row
- * for this worker's node ID matches the passed status before returning.
- *
- * The special case '\0' means "remove the row".
- *
- * No fancy upsert games are required here because we ensure that only one
- * worker can be initing any one database, and that node IDs are unique across
- * a group of BDR nodes.
- */
-static char
-bdr_set_remote_status(PGconn *pgconn, const char status,
-                     const char prev_status)
-{
-   PGresult           *res;
-   char               *status_str;
-   const uint64        sysid = GetSystemIdentifier();
-   /* Needs to fit max length of UINT64_FORMAT */
-   char                sysid_str[33];
-   char                tlid_str[33];
-   char                mydatabaseid_str[33];
-
-   if (status == prev_status)
-       /* No action required (we could check the remote, but meh) */
-       return status;
-
-   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
-            GetSystemIdentifier());
-   sysid_str[sizeof(sysid_str)-1] = '\0';
-
-   snprintf(tlid_str, sizeof(tlid_str), "%u",
-            ThisTimeLineID);
-   tlid_str[sizeof(tlid_str)-1] = '\0';
-
-   snprintf(mydatabaseid_str, sizeof(mydatabaseid_str), "%u",
-            MyDatabaseId);
-   mydatabaseid_str[sizeof(mydatabaseid_str)-1] = '\0';
-
-   if (status == '\0')
-   {
-       Oid         param_types[] = {TEXTOID, OIDOID, OIDOID};
-       const char *param_values[3];
-       char        new_status;
-
-       param_values[0] = sysid_str;
-       param_values[1] = tlid_str;
-       param_values[2] = mydatabaseid_str;
-
-       res = PQexecParams(pgconn,
-                          "DELETE FROM bdr.bdr_nodes WHERE node_sysid = $1"
-                          " AND node_timeline = $2 AND node_dboid = $3 "
-                          "RETURNING node_status",
-                          3, param_types, param_values, NULL, NULL, 0);
-
-       if (PQresultStatus(res) != PGRES_TUPLES_OK)
-       {
-           elog(FATAL, "bdr: Failed to delete row from bdr_nodes: status %s: %s\n",
-                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
-       }
-       if (PQntuples(res) == 0)
-       {
-           /*
-            * If prev_status was '\0' we wouldn't be here, so we should've
-            * got a returned value.
-            */
-           elog(FATAL, "bdr: bdr.bdr_nodes row for sysid=" UINT64_FORMAT
-                       ", tlid=%u, dboid=%u missing, expected row with status=%c",
-                sysid, ThisTimeLineID, MyDatabaseId, (int)prev_status);
-       }
-       status_str = PQgetvalue(res, 0, 0);
-       Assert(strlen(status_str) == 1);
-       new_status = status_str[0];
-
-       if (new_status != prev_status)
-       {
-           elog(FATAL, "bdr: bdr.bdr_nodes row for node_sysid=" UINT64_FORMAT
-                       ", timeline=%u, dboid=%u had status=%c, expected status=%c",
-                sysid, ThisTimeLineID, MyDatabaseId, (int) new_status,
-                (int) prev_status);
-       }
-
-       PQclear(res);
-   }
-   else
-   {
-       Oid         param_types[] = {CHAROID, TEXTOID, OIDOID, OIDOID};
-       const char *param_values[4];
-       char        new_status;
-       char        status_str[2];
-
-       snprintf(status_str, 2, "%c", (int)status);
-       param_values[0] = status_str;
-       param_values[1] = sysid_str;
-       param_values[2] = tlid_str;
-       param_values[3] = mydatabaseid_str;
-
-       res = PQexecParams(pgconn,
-                          "UPDATE bdr.bdr_nodes "
-                          "SET node_status = $1 "
-                          "WHERE node_sysid = $2 AND node_timeline = $3 "
-                          "AND node_dboid = $4 "
-                          "RETURNING ("
-                          "  SELECT node_status FROM bdr.bdr_nodes "
-                          "  WHERE node_sysid = $2 AND node_timeline = $3 "
-                          "  AND node_dboid = $4"
-                          ")",
-                          4, param_types, param_values, NULL, NULL, 0);
-
-       if (PQresultStatus(res) != PGRES_TUPLES_OK)
-       {
-           elog(FATAL,
-                "bdr: Failed to update bdr.nodes row: status %s: %s\n",
-                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
-       }
-       if (PQntuples(res) != 0)
-       {
-           char *new_status_str;
-           /* Updated a row */
-           new_status_str = PQgetvalue(res, 0, 0);
-           Assert(strlen(status_str) == 1);
-           new_status = new_status_str[0];
-           if (new_status != prev_status)
-           {
-               elog(FATAL,
-                    "bdr: bdr.bdr_nodes row for node_sysid=" UINT64_FORMAT
-                    ", timeline=%u, dboid=%u had status=%c, expected status=%c",
-                    sysid, ThisTimeLineID, MyDatabaseId, (int)new_status,
-                    (int)prev_status);
-           }
-
-           PQclear(res);
-       }
-       else
-       {
-           /* No rows affected, insert a new row instead. We re-use the previous
-            * query parameters. */
-           PQclear(res);
-           res = PQexecParams(pgconn,
-                              "INSERT INTO bdr.bdr_nodes"
-                              " (node_status, node_sysid, node_timeline, node_dboid)"
-                              " VALUES ($1, $2, $3, $4);",
-                              4, param_types, param_values, NULL, NULL, 0);
-
-           if (PQresultStatus(res) != PGRES_COMMAND_OK)
-           {
-               elog(FATAL,
-                    "bdr: Failed to insert row into bdr.bdr_nodes: status %s: %s\n",
-                    PQresStatus(PQresultStatus(res)),
-                    PQresultErrorMessage(res));
-           }
-           PQclear(res);
-       }
-   }
-
-   return status;
 }
 
 static XLogRecPtr
@@ -362,6 +131,8 @@ bdr_get_remote_ext_version(PGconn *pgconn, char **default_version,
    else if (PQntuples(res) == 0)
    {
        /* bdr ext is not known to Pg at all */
+       *default_version = NULL;
+       *installed_version = NULL;
    }
    else
    {
@@ -406,97 +177,6 @@ bdr_ensure_ext_installed(PGconn *pgconn)
    pfree(installed_version);
 }
 
-
-static void
-bdr_drop_slot_and_replication_identifier(BdrConnectionConfig *cfg)
-{
-
-   PGconn     *streamConn;
-   RepNodeId   replication_identifier;
-   NameData    slot_name;
-   TimeLineID  timeline;
-   Oid         dboid;
-   uint64      sysid;
-   PGresult   *res;
-   StringInfoData query;
-   char       *sqlstate;
-   NameData    appname;
-   char       *remote_ident;
-
-
-   elog(DEBUG1, "bdr %s: Dropping slot and local ident from connection %s",
-        cfg->dbname, cfg->name);
-
-   snprintf(NameStr(appname), NAMEDATALEN, "slot drop");
-   (NameStr(appname))[NAMEDATALEN-1] = '\0';
-
-   /* Establish BDR conn and IDENTIFY_SYSTEM */
-   streamConn = bdr_connect(
-       cfg->dsn, &appname,
-       &sysid, &timeline, &dboid
-       );
-
-   bdr_build_ident_and_slotname(sysid, timeline, dboid,
-           &remote_ident, &slot_name);
-
-
-   StartTransactionCommand();
-   replication_identifier = GetReplicationIdentifier(remote_ident, true);
-
-   pfree(remote_ident);
-
-   if (OidIsValid(replication_identifier))
-   {
-       /* Local replication identifier exists and must be dropped. */
-       elog(DEBUG2, "bdr %s: Deleting local replication identifier %hu",
-            cfg->dbname, replication_identifier);
-       DropReplicationIdentifier(replication_identifier);
-       /*
-        * We should CHECKPOINT after this to make sure replication
-        * identifier state gets flushed.
-        */
-       RequestCheckpoint(CHECKPOINT_IMMEDIATE|CHECKPOINT_FORCE);
-   }
-   else
-   {
-       elog(DEBUG2, "bdr %s: No local replication identifier to delete",
-            cfg->dbname);
-   }
-
-   /*
-    * Remove corresponding remote slot if it exists. We can't query
-    * whether it exists or not silently over the replication protocol,
-    * so we just try it and cope if it's missing.
-    */
-   initStringInfo(&query);
-   appendStringInfo(&query, "DROP_REPLICATION_SLOT %s", NameStr(slot_name));
-   res = PQexec(streamConn, query.data);
-   if (PQresultStatus(res) == PGRES_COMMAND_OK)
-   {
-       elog(DEBUG2, "bdr %s: remote replication slot %s deleted",
-            cfg->dbname, NameStr(slot_name));
-   }
-   else
-   {
-       /* SQLSTATE 42704 expected; others are error conditions */
-       sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
-       if (strcmp(sqlstate, "42704") != 0)
-       {
-           ereport(ERROR,
-                   (errmsg("'DROP_REPLICATION_SLOT %s' on bdr connection %s failed with sqlstate %s: %s",
-                           NameStr(slot_name), cfg->name,
-                           sqlstate,PQresultErrorMessage(res))));
-       }
-       else
-       {
-           elog(DEBUG2, "bdr %s: No slot to delete", cfg->dbname);
-       }
-   }
-   CommitTransactionCommand();
-   PQclear(res);
-   PQfinish(streamConn);
-}
-
 static void
 bdr_init_replica_cleanup_tmpdir(int errcode, Datum tmpdir)
 {
@@ -510,11 +190,14 @@ bdr_init_replica_cleanup_tmpdir(int errcode, Datum tmpdir)
 
 /*
  * Use a script to copy the contents of a remote node using pg_dump and apply
- * it to the local node. Runs during slot creation to bring up a new logical
- * replica from an existing node.
+ * it to the local node. Runs during node join to bring up a new
+ * logical replica from an existing node. The remote dump is taken from the
+ * start position of a slot on the remote end to ensure that we never replay
+ * changes included in the dump and never miss changes.
  */
 static void
-bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
+bdr_init_exec_dump_restore(BDRNodeInfo *node,
+                          char *snapshot)
 {
 #ifndef WIN32
    pid_t pid;
@@ -564,22 +247,21 @@ bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
 
 
    appendStringInfo(&origin_dsn,
-                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica dump'",
-                    cfg->dsn, BDR_LOCALID_FORMAT_ARGS, cfg->name);
+                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": init_replica dump'",
+                    node->init_from_dsn, BDR_LOCALID_FORMAT_ARGS);
 
-   if (cfg->replica_local_dsn == NULL)
-       elog(FATAL, "bdr init_replica: no replica_local_dsn specified");
    appendStringInfo(&local_dsn,
-                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica restore'",
-                    cfg->replica_local_dsn, BDR_LOCALID_FORMAT_ARGS, cfg->name);
+                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": init_replica restore'",
+                    node->local_dsn, BDR_LOCALID_FORMAT_ARGS);
 
    /*
     * Suppress replication of changes applied via pg_restore back to
     * the local node.
     *
-    * XXX DYNCONF: This should PQconninfoParse, modify the options keyword or
-    * add it, and reconstruct the string using the functions from pg_dumpall
-    * (also to be used for init_copy). This is a hack.
+    * TODO: This should PQconninfoParse, modify the options keyword or add
+    * it, and reconstruct the string using the functions from pg_dumpall
+    * (also to be used for init_copy). Simply appending the options
+    * instead is a bit dodgy.
     */
    appendStringInfoString(&local_dsn,
                           " options='-c bdr.do_not_replicate=on -c bdr.permit_unsafe_ddl_commands=on -c bdr.skip_ddl_replication=on -c bdr.skip_ddl_locking=on'");
@@ -628,8 +310,8 @@ bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
 
        ereport(LOG,
                (errmsg("Creating replica with: %s --snapshot %s --source \"%s\" --target \"%s\" --tmp-directory \"%s\", --pg-dump-path \"%s\", --pg-restore-path \"%s\"",
-                       bdr_init_replica_script_path, snapshot, cfg->dsn,
-                       cfg->replica_local_dsn, tmpdir,
+                       bdr_init_replica_script_path, snapshot,
+                       node->init_from_dsn, node->local_dsn, tmpdir,
                        bdr_dump_path, bdr_restore_path)));
 
        n = execv(bdr_init_replica_script_path, argv);
@@ -699,376 +381,717 @@ bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
 #endif
 }
 
+/*
+ * BDR state synchronization.
+ *
+ * Opens a non-replication connection to the local node, then, inside one
+ * transaction on each side with bdr.bdr_nodes and bdr.bdr_connections locked
+ * in EXCLUSIVE mode:
+ *   - copies every remote bdr.bdr_nodes row to the local node,
+ *   - copies the local node's own bdr.bdr_nodes row to the remote node,
+ *   - copies every remote bdr.bdr_connections row to the local node.
+ * Both transactions are then committed (remote first) and the local
+ * connection is closed. remote_conn is left open for the caller.
+ */
 static void
-bdr_init_replica_conn_close(int code, Datum connptr)
+bdr_sync_nodes(PGconn *remote_conn, BDRNodeInfo *local_node)
 {
-   PGconn **conn_p;
-   PGconn *conn;
+   PGconn *local_conn;
 
-   conn_p = (PGconn**) DatumGetPointer(connptr);
-   Assert(conn_p != NULL);
-   conn = *conn_p;
+   local_conn = bdr_connect_nonrepl(local_node->local_dsn, "init");
 
-   if (conn == NULL)
-       return;
-   if (PQstatus(conn) != CONNECTION_OK)
-       return;
-   PQfinish(conn);
+   PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&local_conn));
+   {
+       StringInfoData query;
+       PGresult   *res;
+       char        sysid_str[33];
+       const char *const setup_query =
+           "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;\n"
+           "SET LOCAL search_path = bdr, pg_catalog;\n"
+           "SET LOCAL bdr.permit_unsafe_ddl_commands = on;\n"
+           "SET LOCAL bdr.skip_ddl_replication = on;\n"
+           "SET LOCAL bdr.skip_ddl_locking = on;\n"
+           "LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;\n"
+           "LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;\n";
+
+       /* Setup the environment. */
+       res = PQexec(remote_conn, setup_query);
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "BEGIN or table locking on remote failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+
+       res = PQexec(local_conn, setup_query);
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "BEGIN or table locking on local failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+
+       /* Copy remote bdr_nodes entries to the local node. */
+       bdr_copytable(remote_conn, local_conn,
+                     "COPY (SELECT * FROM bdr.bdr_nodes) TO stdout",
+                     "COPY bdr.bdr_nodes FROM stdin");
+
+       /* Copy the local entry to remote node. */
+       initStringInfo(&query);
+       /* No need to quote as everything is numbers. */
+       snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, local_node->sysid);
+       sysid_str[sizeof(sysid_str)-1] = '\0';
+       appendStringInfo(&query,
+                        "COPY (SELECT * FROM bdr.bdr_nodes WHERE "
+                           "node_sysid = '%s' AND node_timeline = '%u' "
+                           "AND node_dboid = '%u') TO stdout",
+                        sysid_str, local_node->timeline, local_node->dboid);
+
+       bdr_copytable(local_conn, remote_conn,
+                     query.data, "COPY bdr.bdr_nodes FROM stdin");
+
+       /*
+        * Copy remote connections to the local node.
+        *
+        * Adding local connection to remote node is handled separately
+        * because it triggers the connect-back process on the remote node(s).
+        */
+       bdr_copytable(remote_conn, local_conn,
+                     "COPY (SELECT * FROM bdr.bdr_connections) TO stdout",
+                     "COPY bdr.bdr_connections FROM stdin");
+
+       /* Save changes. */
+       res = PQexec(remote_conn, "COMMIT");
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "COMMIT on remote failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+
+       /* Name the right side in the message: this is the local commit. */
+       res = PQexec(local_conn, "COMMIT");
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "COMMIT on local failed: %s",
+                   PQresultErrorMessage(res));
+       PQclear(res);
+   }
+   PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                               PointerGetDatum(&local_conn));
+   PQfinish(local_conn);
+}
+
+/*
+ * Register this node's connection details on a remote node.
+ *
+ * Calls bdr.internal_node_join() over conn with this node's identity
+ * (sysid, timeline, database oid) followed by the dsn, apply_delay and
+ * replication sets taken from myconfig. Raises ERROR if the call does not
+ * return a tuple result.
+ */
+static void
+bdr_insert_remote_conninfo(PGconn *conn, BdrConnectionConfig *myconfig)
+{
+#define INTERNAL_NODE_JOIN_NPARAMS 6
+   PGresult   *res;
+   ExecStatusType status;
+   Oid         types[INTERNAL_NODE_JOIN_NPARAMS] = { TEXTOID, OIDOID, OIDOID, TEXTOID, INT4OID, TEXTARRAYOID };
+   const char *values[INTERNAL_NODE_JOIN_NPARAMS];
+   StringInfoData      replicationsets;
+
+   /* Needs to fit max length of UINT64_FORMAT */
+   char                sysid_str[33];
+   char                tlid_str[33];
+   char                mydatabaseid_str[33];
+   char                apply_delay[33];
+
+   initStringInfo(&replicationsets);
+
+   stringify_my_node_identity(sysid_str, sizeof(sysid_str),
+                              tlid_str, sizeof(tlid_str),
+                              mydatabaseid_str, sizeof(mydatabaseid_str));
+
+   values[0] = &sysid_str[0];
+   values[1] = &tlid_str[0];
+   values[2] = &mydatabaseid_str[0];
+   values[3] = myconfig->dsn;
+
+   snprintf(&apply_delay[0], sizeof(apply_delay), "%d", myconfig->apply_delay);
+   values[4] = &apply_delay[0];
+   /*
+    * Replication sets are stored as a quoted identifier list. To turn
+    * it into an array literal we can just wrap some brackets around it.
+    */
+   appendStringInfo(&replicationsets, "{%s}", myconfig->replication_sets);
+   values[5] = replicationsets.data;
+
+   res = PQexecParams(conn,
+                      "SELECT bdr.internal_node_join($1,$2,$3,$4,$5,$6);",
+                      INTERNAL_NODE_JOIN_NPARAMS,
+                      types, &values[0], NULL, NULL, 0);
+
+   /* The query has completed, so the parameter buffer is no longer needed. */
+   pfree(replicationsets.data);
+
+   /*
+    * Capture the status and release the result before reporting, so the
+    * PGresult is freed on both the success and the error path. The error
+    * text below is read from the connection, not from the result.
+    */
+   status = PQresultStatus(res);
+   PQclear(res);
+
+   /*
+    * bdr.internal_node_join() must correctly handle unique violations.
+    * Otherwise init that resumes after slot creation, when we're waiting
+    * for inbound slots, will fail.
+    */
+   if (status != PGRES_TUPLES_OK)
+       elog(ERROR, "unable to update remote bdr.bdr_connections: %s",
+                   PQerrorMessage(conn));
+
+#undef INTERNAL_NODE_JOIN_NPARAMS
 }
 
 /*
- * Determine whether we need to initialize the database from a remote
- * node and perform the required initialization if so.
+ * Find all connections other than our own using the copy of
+ * bdr.bdr_connections that we acquired from the remote server during
+ * apply. Apply workers won't be started yet, we're just making the
+ * slots.
+ *
+ * If the slot already exists from a prior attempt we'll leave it
+ * alone. It'll be advanced when we start replaying from it anyway,
+ * and it's guaranteed to retain more than the WAL we need.
  */
-void
-bdr_init_replica(Name dbname)
+static void
+bdr_init_make_other_slots()
 {
-   char status;
-   XLogRecPtr min_remote_lsn;
-   PGconn *nonrepl_init_conn;
-   StringInfoData dsn;
-   BdrWorker  *init_replica_worker;
-   BdrConnectionConfig *init_replica_config;
-   int spi_ret;
+   List       *configs;
+   ListCell   *lc;
+   MemoryContext old_context;
 
-   initStringInfo(&dsn);
+   Assert(!IsTransactionState());
+   StartTransactionCommand();
+   old_context = MemoryContextSwitchTo(TopMemoryContext);
+   configs = bdr_read_connection_configs();
+   MemoryContextSwitchTo(old_context);
+   CommitTransactionCommand();
+
+   foreach(lc, configs)
+   {
+       BdrConnectionConfig *cfg = lfirst(lc);
+       PGconn *conn;
+       NameData slot_name;
+       uint64 sysid;
+       TimeLineID timeline;
+       Oid dboid;
+       RepNodeId replication_identifier;
+       char *snapshot;
+
+       if (cfg->sysid == GetSystemIdentifier() &&
+           cfg->timeline == ThisTimeLineID &&
+           cfg->dboid == MyDatabaseId)
+       {
+           /* Don't make a slot pointing to ourselves; free the entry first */
+           bdr_free_connection_config(cfg);
+           continue;
+       }
+
+       conn = bdr_establish_connection_and_slot(cfg->dsn, "mkslot", &slot_name,
+               &sysid, &timeline, &dboid, &replication_identifier,
+               &snapshot);
+
+       /* Ensure the slot points to the node the conn info says it should */
+       if (cfg->sysid != sysid ||
+           cfg->timeline != timeline ||
+           cfg->dboid != dboid)
+       {
+           ereport(ERROR,
+                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                    errmsg("System identification mismatch between connection and slot"),
+                    errdetail("Connection for "BDR_LOCALID_FORMAT" resulted in slot on node "BDR_LOCALID_FORMAT" instead of expected node",
+                              cfg->sysid, cfg->timeline, cfg->dboid, EMPTY_REPLICATION_NAME,
+                              sysid, timeline, dboid, EMPTY_REPLICATION_NAME)));
+       }
+
+       /* We don't require the snapshot IDs here */
+       if (snapshot != NULL)
+           pfree(snapshot);
+
+       /* No replication for now, just close the connection */
+       PQfinish(conn);
 
-   elog(DEBUG2, "bdr %s: bdr_init_replica",
-        NameStr(*dbname));
+       elog(DEBUG2, "Ensured existence of slot %s on "BDR_LOCALID_FORMAT,
+                    NameStr(slot_name), cfg->sysid, cfg->timeline, cfg->dboid,
+                    EMPTY_REPLICATION_NAME);
+
+       bdr_free_connection_config(cfg);
+   }
+
+   list_free(configs);
+}
+
+/*
+ * For each outbound connection in bdr.bdr_connections we should have a local
+ * replication slot created by a remote node using our connection info.
+ *
+ * Wait until all such entries are created and active, then return.
+ */
+static void
+bdr_init_wait_for_slot_creation()
+{
+   List       *configs;
+   ListCell   *lc;
+   Name*       slot_names;
+   Size        n_slots;
+   int         tup_idx, arr_idx;
+
+   elog(INFO, "waiting for all inbound slots to be established");
 
    /*
-    * The local SPI transaction we're about to perform must do any writes as a
-    * local transaction, not as a changeset application from a remote node.
-    * That allows rows to be repliated to other nodes. So no replication_origin_id
-    * may be set.
+    * Determine the list of expected slot identifiers. These are
+    * inbound slots, so they're our db oid + the remote's bdr ident.
     */
-   Assert(replication_origin_id == InvalidRepNodeId);
+   StartTransactionCommand();
+   configs = bdr_read_connection_configs();
+
+   slot_names = (Name*)palloc0(sizeof(Name) * list_length(configs));
+
+   n_slots = 0;
+   foreach(lc, configs)
+   {
+       BdrConnectionConfig *cfg = lfirst(lc);
+       Name slot_name;
+
+       if (cfg->sysid == GetSystemIdentifier() &&
+           cfg->timeline == ThisTimeLineID &&
+           cfg->dboid == MyDatabaseId)
+       {
+           /* We won't see an inbound slot from our own node */
+           continue;
+       }
+
+       /* There's no corresponding incoming slot for a unidirectional slot */
+       if (cfg->is_unidirectional)
+           continue;
+
+       slot_name = (NameData*) palloc0(sizeof(NameData));
+       bdr_slot_name(slot_name, cfg->sysid, cfg->timeline, cfg->dboid,
+                     MyDatabaseId);
+
+       elog(DEBUG2, "expecting inbound slot named %s", NameStr(*slot_name));
+
+       slot_names[n_slots++] = slot_name;
+   }
 
    /*
-    * Check the local bdr.bdr_nodes over SPI or direct scan to see if
-    * there's an entry for ourselves in ready mode already.
+    * Wait for each to be created. There's no useful way to be notified when a
+    * slot gets created, so just scan all slots to see if all the ones we want
+    * are present and active. If not, sleep and retry soon.
     *
-    * Note that we don't have to explicitly SPI_finish(...) on error paths;
-    * that's taken care of for us.
+    * This is a very inefficient approach but for the number of slots we're
+    * interested in it doesn't matter.
     */
-   StartTransactionCommand();
-   spi_ret = SPI_connect();
-   if (spi_ret != SPI_OK_CONNECT)
-       elog(ERROR, "SPI already connected; this shouldn't be possible");
+   SPI_connect();
 
-   status = bdr_nodes_get_local_status(GetSystemIdentifier(), ThisTimeLineID,
-                                       MyDatabaseId);
-   if (status == 'r')
+   while (true)
    {
-       /* Already in ready state, nothing more to do */
-       elog(DEBUG2, "init_replica: Already inited");
-       SPI_finish();
-       CommitTransactionCommand();
-       return;
+       Datum   values[1] = {MyDatabaseId};
+       Oid     types[1] = {OIDOID};
+       Size    n_slots_found = 0;
+
+       SPI_execute_with_args("select slot_name "
+                             "from pg_catalog.pg_replication_slots "
+                             "where plugin = '"BDR_LIBRARY_NAME"' "
+                             "and slot_type = 'logical' "
+                             "and datoid = $1 and active",
+                             1, types, values, NULL, false, 0);
+
+       for (tup_idx = 0; tup_idx < SPI_processed; tup_idx++)
+       {
+           char       *slot_name;
+
+           slot_name = SPI_getvalue(SPI_tuptable->vals[tup_idx],
+                                    SPI_tuptable->tupdesc,
+                                    1);
+
+           Assert(slot_name != NULL);
+
+           /*
+            * Does this slot appear in the array of expected slots and if so,
+            * have we seen it already?
+            *
+            * This is O(m*n) for m existing slots and n expected slots, but
+            * really, for this many slots, who cares.
+            */
+           for (arr_idx = 0; arr_idx < n_slots; arr_idx++)
+           {
+               if ( strcmp(NameStr(*slot_names[arr_idx]), slot_name) == 0 )
+               {
+                   n_slots_found++;
+                   break;
+               }
+           }
+       }
+
+       if (n_slots_found == n_slots)
+           break;
+
+       elog(DEBUG2, "found %u of %u expected slots, sleeping",
+            (uint32)n_slots_found, (uint32)n_slots);
+
+       pg_usleep(100000);
    }
 
+   SPI_finish();
+
+   CommitTransactionCommand();
+
+   elog(INFO, "all inbound slots established");
+
    /*
-    * Before starting workers we must determine if we need to copy
-    * initial state from a remote node. This is only necessary if
-    * there is a connection with init_replica set and we do not yet
-    * have an entry in the local "bdr.bdr_nodes" table for our node
-    * ID showing initialisation to be complete.
+    * Should this also check all outbound workers are connected? Doing so
+    * isn't simple - checking for replication identifiers doesn't confirm that
+    * the connection is active. We'd need to talk to the apply workers or try
+    * to convey information via pg_stat_activity.
     */
-   LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
-   init_replica_worker = find_init_replica_worker(dbname);
-   LWLockRelease(BdrWorkerCtl->lock);
-   if (!init_replica_worker)
+}
+
+/*
+ * TODO DYNCONF perform_pointless_transaction
+ *
+ * This is temporary code to be removed when the full part/join protocol is
+ * introduced, at which point WAL messages should handle this. See comments on
+ * call site.
+ */
+static void
+perform_pointless_transaction(PGconn *conn, BDRNodeInfo *node)
+{
+   PGresult   *res;
+
+   res = PQexec(conn, "CREATE TEMP TABLE bdr_init(a int) ON COMMIT DROP");
+   Assert(PQresultStatus(res) == PGRES_COMMAND_OK);
+   PQclear(res);
+}
+
+/*
+ * Initialize the database, from a remote node if necessary.
+ */
+void
+bdr_init_replica(BDRNodeInfo *local_node)
+{
+   char                status;
+   PGconn             *nonrepl_init_conn;
+   StringInfoData      dsn;
+   BdrConnectionConfig *local_conn_config;
+
+   initStringInfo(&dsn);
+
+   status = local_node->status;
+
+   Assert(status != 'r');
+
+   elog(DEBUG2, "bdr_init_replica");
+
+   /*
+    * The local SPI transaction we're about to perform must do any writes as a
+    * local transaction, not as a changeset application from a remote node.
+    * That allows rows to be replicated to other nodes. So no replication_origin_id
+    * may be set.
+    */
+   Assert(replication_origin_id == InvalidRepNodeId);
+
+   /*
+    * Before starting workers we must determine if we need to copy initial
+    * state from a remote node. This is necessary unless we are the first node
+    * created or we've already completed init. If we'd already completed init
+    * we would've exited above.
+    */
+   if (local_node->init_from_dsn == NULL)
    {
-       if (status != '\0')
+       if (status != 'b')
        {
            /*
             * Even though there's no init_replica worker, the local bdr.bdr_nodes table
             * has an entry for our (sysid,dbname) and it isn't status=r (checked above),
-            * we must've had an init_replica configured before, then removed.
+            * this should never happen
             */
-           ereport(ERROR, (errmsg("bdr.bdr_nodes row with (sysid="
-                   UINT64_FORMAT ", dbname=%s) exists and has status=%c, but "
-                   "no connection with init_replica=t is configured for this "
-                   "database. ",
-                   GetSystemIdentifier(), NameStr(*dbname), status),
-                   errdetail("You probably configured initial setup with "
-                   "init_replica on a connection, then removed or changed that "
-                   "connection before setup completed properly. "),
-                   errhint("DROP and re-create the database if it has no "
-                   "existing content of value, or add the init_replica setting "
-                   "to one of the connections.")));
+           ereport(ERROR, (errmsg("bdr.bdr_nodes row with "BDR_LOCALID_FORMAT" exists and has status=%c, "
+                                  "but has init_from_dsn set to NULL",
+                   GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId, EMPTY_REPLICATION_NAME, status)));
        }
        /*
         * No connections have init_replica=t, so there's no remote copy to do.
         * We still have to ensure that bdr.bdr_nodes.status is 'r' for this
         * node so that slot creation is permitted.
+        *
+        * XXX: is this actually a good idea?
         */
        elog(DEBUG2, "init_replica: Marking as root/standalone node");
        bdr_nodes_set_local_status('r');
-   }
-   /*
-    * We no longer require the transaction for SPI; further work gets done on
-    * the remote machine's bdr.bdr_nodes table and replicated back to us via
-    * pg_dump/pg_restore, or over the walsender protocol once we start
-    * replay. If we aren't just about to exit anyway.
-    */
-   SPI_finish();
-   CommitTransactionCommand();
 
-   if (!init_replica_worker)
-       /* Cleanup done and nothing more to do */
        return;
+   }
 
-   init_replica_config = bdr_connection_configs
-       [init_replica_worker->data.apply.connection_config_idx];
-   elog(LOG, "bdr %s: bdr_init_replica init from connection %s",
-        NameStr(*dbname), init_replica_config->name);
+   local_conn_config = bdr_get_connection_config(
+           local_node->sysid,
+           local_node->timeline,
+           local_node->dboid,
+           true);
 
-   resetStringInfo(&dsn);
-   appendStringInfo(&dsn,
-                    "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica setup'",
-                    init_replica_config->dsn, BDR_LOCALID_FORMAT_ARGS,
-                    init_replica_config->name);
+   elog(DEBUG1, "init_replica init from remote %s",
+        local_node->init_from_dsn);
 
-   /*
-    * Test to see if there's an entry in the remote's bdr.bdr_nodes for our
-    * system identifier. If there is, that'll tell us what stage of startup
-    * we are up to and let us resume an incomplete start.
-    */
-   nonrepl_init_conn = PQconnectdb(dsn.data);
-   if (PQstatus(nonrepl_init_conn) != CONNECTION_OK)
-   {
-       ereport(FATAL,
-               (errmsg("bdr %s: could not connect to the upstream server in non-replication mode: %s",
-                       NameStr(*dbname),
-                       PQerrorMessage(nonrepl_init_conn))));
-   }
+   nonrepl_init_conn =
+       bdr_connect_nonrepl(local_node->init_from_dsn, "init");
 
-   PG_ENSURE_ERROR_CLEANUP(bdr_init_replica_conn_close,
-           PointerGetDatum(&nonrepl_init_conn));
+   PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&nonrepl_init_conn));
    {
        bdr_ensure_ext_installed(nonrepl_init_conn);
 
-       /* Get the bdr.bdr_nodes status field for our node id from the remote */
-       status = bdr_get_remote_status(nonrepl_init_conn);
-
-       if (bdr_init_from_basedump)
-       {
-           status = bdr_set_remote_status(nonrepl_init_conn, 'c', status);
-       }
-       else
+       switch (status)
        {
-           switch (status)
-           {
-               case '\0':
-                   elog(DEBUG2, "bdr %s: initializing from clean state",
-                        NameStr(*dbname));
-                   break;
+           case 'b':
+               elog(DEBUG2, "initializing from clean state");
+               break;
 
-               case 'r':
-                   /*
-                    * Init has been completed, but we didn't check our local
-                    * bdr.bdr_nodes, or the final update hasn't propagated yet.
-                    *
-                    * All we need to do is catch up, we already replayed enough to be
-                    * consistent and start up in normal mode last time around
-                    */
-                   elog(DEBUG2, "bdr %s: init already completed, nothing to do",
-                        NameStr(*dbname));
-                   return;
-
-               case 'c':
-                   /*
-                    * We were in catchup mode when we died. We need to resume catchup
-                    * mode up to the expected LSN before switching over.
-                    *
-                    * To do that all we need to do is fall through without doing any
-                    * slot re-creation, dump/apply, etc, and pick up when we do
-                    * catchup.
-                    *
-                    * We won't know what the original catchup target point is, but we
-                    * can just catch up to whatever xlog position the server is
-                    * currently at.
-                    */
-                   elog(DEBUG2, "bdr %s: dump applied, need to continue catchup",
-                        NameStr(*dbname));
-                   break;
+           case 'r':
+               elog(ERROR, "unexpected state");
 
-               case 'i':
-                   /*
-                    * A previous init attempt seems to have failed. Clean up, then
-                    * fall through to start setup again.
-                    *
-                    * We can't just re-use the slot and replication identifier that
-                    * were created last time (if they were), because we have no way
-                    * of getting the slot's exported snapshot after
-                    * CREATE_REPLICATION_SLOT.
-                    */
-                   elog(DEBUG2, "bdr %s: previous failed initalization detected, cleaning up",
-                        NameStr(*dbname));
-                   bdr_drop_slot_and_replication_identifier(init_replica_config);
-                   status = bdr_set_remote_status(nonrepl_init_conn, '\0', status);
-                   break;
+           case 'c':
+               /*
+                * We were in catchup mode when we died. We need to resume catchup
+                * mode up to the expected LSN before switching over.
+                *
+                * To do that all we need to do is fall through without doing any
+                * slot re-creation, dump/apply, etc, and pick up where we do
+                * catchup.
+                *
+                * We won't know what the original catchup target point is, but we
+                * can just catch up to whatever xlog position the server is
+                * currently at, it's guaranteed to be later than the target
+                * position.
+                */
+               elog(DEBUG2, "dump applied, need to continue catchup");
+               break;
 
-               default:
-                   elog(ERROR, "unreachable"); /* Unhandled case */
-                   break;
-           }
+           case 'o':
+               elog(DEBUG2, "dump applied and catchup completed, need to continue slot creation");
+               break;
+
+           case 'i':
+               /*
+                * A previous init attempt seems to have failed.
+                * Clean up, then fall through to start setup
+                * again.
+                *
+                * We can't just re-use the slot and replication
+                * identifier that were created last time (if
+                * they were), because we have no way of getting
+                * the slot's exported snapshot after
+                * CREATE_REPLICATION_SLOT.
+                *
+                * We could drop and re-create the slot, but...
+                *
+                * We also have no way to undo a failed
+                * pg_restore, so if that phase fails it's
+                * necessary to do manual cleanup, dropping and
+                * re-creating the db.
+                *
+                * To avoid that we need to be able to run
+                * pg_restore --clean, and that needs a way to
+                * exclude the bdr schema, the bdr extension,
+                * and their dependencies like plpgsql and
+                * btree_gist. (TODO patch pg_restore for that)
+                */
+               ereport(ERROR,
+                       (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                        errmsg("previous init failed, manual cleanup is required"),
+                        errdetail("Found bdr.bdr_nodes entry for "BDR_LOCALID_FORMAT" with state=i in remote bdr.bdr_nodes", BDR_LOCALID_FORMAT_ARGS),
+                        errhint("Remove all replication identifiers and slots corresponding to this node from the init target node then drop and recreate this database and try again")));
+               break;
+
+           default:
+               elog(ERROR, "unreachable %c", status); /* Unhandled case */
+               break;
        }
 
-       if (status == '\0')
+       if (status == 'b')
        {
-           int         off;
-           int        *my_conn_idxs;
-           int         n_conns = 0;
            char       *init_snapshot = NULL;
            PGconn     *init_repl_conn = NULL;
+           NameData    slot_name;
+           uint64      remote_sysid;
+           TimeLineID  remote_timeline;
+           Oid         remote_dboid;
+           RepNodeId   repnodeid;
 
-           elog(LOG, "bdr %s: initializing from remote db", NameStr(*dbname));
+           elog(INFO, "initializing node");
 
            /*
             * We're starting from scratch or have cleaned up a previous failed
             * attempt.
             */
-           status = bdr_set_remote_status(nonrepl_init_conn, 'i', status);
+           status = 'i';
+           bdr_nodes_set_local_status(status);
 
            /*
-            * A list of all connections to make slots for, as indexes into
-            * BdrWorkerCtl.
+            * This is a unidirectional subscribe; let the other node know
+            * that it should behave as a BDR node (it might be a UDR node,
+            * which does not require init).
             */
-           my_conn_idxs = (int*)palloc(sizeof(Size) * bdr_max_workers);
+           if (local_conn_config == NULL)
+               bdr_remote_activate(nonrepl_init_conn);
 
-           /* Collect a list of connections to make slots for. */
-           LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
-           for (off = 0; off < bdr_max_workers; off++)
-           {
-               BdrWorker              *worker = &BdrWorkerCtl->slots[off];
-
-               if (worker->worker_type == BDR_WORKER_APPLY)
-               {
-                   BdrConnectionConfig * const cfg = bdr_connection_configs
-                       [worker->data.apply.connection_config_idx];
+           /*
+            * Now establish our slot on the target node, so we can replay
+            * changes from that node. It'll be used in catchup mode.
+            */
+           init_repl_conn = bdr_establish_connection_and_slot(
+                               local_node->init_from_dsn,
+                               "init", &slot_name,
+                               &remote_sysid, &remote_timeline, &remote_dboid,
+                               &repnodeid, &init_snapshot);
 
-                   if (strcmp(cfg->dbname, NameStr(*dbname)) == 0)
-                       my_conn_idxs[n_conns++] = off;
-               }
-           }
-           LWLockRelease(BdrWorkerCtl->lock);
+           elog(INFO, "connected to target node "BDR_LOCALID_FORMAT
+                " with snapshot %s",
+                remote_sysid, remote_timeline, remote_dboid,
+                EMPTY_REPLICATION_NAME, init_snapshot);
 
-           elog(DEBUG2, "bdr %s: creating slots for %d nodes",
-                NameStr(*dbname), n_conns);
+           /*
+            * Take the remote dump and apply it. This will give us a local
+            * copy of bdr_connections to work from. It's guaranteed that
+            * everything after this dump will be accessible via the catchup
+            * mode slot created earlier.
+            */
+           bdr_init_exec_dump_restore(local_node, init_snapshot);
 
            /*
-            * For each connection, ensure its slot exists.
+            * TODO DYNCONF copy replication identifier state
+            *
+            * Should copy the target node's pg_catalog.pg_replication_identifier
+            * state for each node to the local node, using the same snapshot
+            * we used to take the dump from the remote. Doing this ensures
+            * that when we create slots to the target nodes they'll begin
+            * replay from a position that's exactly consistent with what's
+            * in the dump.
             *
-            * Do it one by one rather than fiddling with async libpq queries. If
-            * this needs to be parallelized later, it should probably be done by
-            * launching each apply worker and letting them create their own
-            * slots, then having them wait until signalled/unlatched before
-            * proceeding with actual replication. That'll save us another round
-            * of connections too.
+            * We'll still need catchup mode because there's no guarantee our
+            * newly created slots will force all WAL we'd need to be retained
+            * on each node. The target might be behind. So we should catchup
+            * replay until the replication identifier positions received from
+            * catchup are >= the creation positions of the slots we made.
             *
-            * We don't attempt any cleanup if slot creation fails, we just bail out
-            * and leave any already-created slots in place.
+            * (We don't need to do this if we instead send a replay confirmation
+            * request and wait for a reply from each node.)
             */
-           for (off = 0; off < n_conns; off++)
-           {
-               BdrWorker *w = &BdrWorkerCtl->slots[my_conn_idxs[off]];
-               BdrConnectionConfig *cfg;
-               char *snapshot = NULL;
-               PGconn *conn = NULL;
-               RepNodeId replication_identifier;
-               NameData slot_name;
-               uint64 sysid;
-               Oid dboid;
-               TimeLineID timeline;
-
-               cfg = bdr_connection_configs
-                   [w->data.apply.connection_config_idx];
-
-               ereport(LOG,
-                       (errmsg("bdr %s: checking/creating slot for %s at %s",
-                               NameStr(*dbname), cfg->name, cfg->dsn)));
-               /*
-                * Create the slot on the remote. The returned remote sysid and
-                * timeline, the slot name, and the local replication identifier
-                * are all discarded; they're not needed here, and will be obtained
-                * again by the apply workers when they're launched after init.
-                */
-               conn = bdr_establish_connection_and_slot(cfg->dsn, "slot",
-                   &slot_name, &sysid, &timeline, &dboid, &replication_identifier,
-                   &snapshot);
 
-               /* Always throws rather than returning failure */
-               Assert(conn);
+           PQfinish(init_repl_conn);
+           pfree(init_snapshot);
 
-               if (w == init_replica_worker)
-               {
-                   /*
-                    * We need to keep the snapshot ID returned by CREATE SLOT so
-                    * we can pass it to pg_dump to get a consistent dump from the
-                    * remote slot's start point.
-                    *
-                    * The snapshot is only valid for the lifetime of the
-                    * replication connection we created it with, so we must keep
-                    * that connection around until the dump finishes.
-                    */
-                   if (!snapshot)
-                       elog(ERROR, "bdr %s: init_replica failed to create snapshot!",
-                            NameStr(*dbname));
-                   init_snapshot = snapshot;
-                   init_repl_conn = conn;
-               }
-               else
-               {
-                   /*
-                    * Just throw the returned info away; we only needed to create
-                    * the slot so its replication identifier can be advanced
-                    * during catchup.
-                    */
-                   if (snapshot)
-                       pfree(snapshot);
-                   PQfinish(conn);
-               }
+           /*
+            * This is a group join, so copy the state (bdr_nodes and
+            * bdr_connections) over from the init node to our node.
+            */
+           if (local_conn_config != NULL)
+           {
+               elog(DEBUG1, "syncing bdr_nodes and bdr_connections");
+               bdr_sync_nodes(nonrepl_init_conn, local_node);
            }
 
-           pfree(my_conn_idxs);
+           status = 'c';
+           bdr_nodes_set_local_status(status);
+           elog(DEBUG1, "dump and apply finished, preparing for catchup replay");
+       }
+
+       Assert(status != 'b');
+
+       if (status == 'c')
+       {
+           XLogRecPtr min_remote_lsn;
+           remote_node_info ri;
+
+           /*
+            * Launch outbound connections to all other nodes. It doesn't
+            * matter that their slot horizons are after the dump was taken on
+            * the origin node, so we could never replay all the data we need
+            * if we switched to replaying from these slots now.  We'll be
+            * advancing them in catchup mode until they overtake their current
+            * position before switching to replaying from them directly.
+            */
+           bdr_init_make_other_slots();
 
-           /* If we get here, we should have a valid snapshot to dump */
-           Assert(init_snapshot != NULL);
-           Assert(init_repl_conn != NULL);
+           /*
+            * Enter catchup mode and wait until we've replayed up to the LSN
+            * the remote was at when we started catchup.
+            *
+            * TODO: It's possible that this step can lose transactions that
+            * were committed on a 3rd party node before we made our slot on it
+            * but not replicated to the init target node until after we exit
+            * catchup mode. If we acquire the DDL lock during join we can know
+            * that can't happen, so we should do that.
+            */
+           elog(DEBUG3, "getting LSN to replay to in catchup mode");
+           min_remote_lsn = bdr_get_remote_lsn(nonrepl_init_conn);
 
            /*
-            * Execute the dump and apply its self.
+            * Catchup cannot complete if there isn't at least one remote transaction
+            * to replay. So we perform a dummy transaction on the target node.
             *
-            * Note that the bdr extension tables override pg_dump's default and
-            * ask to be included in dumps. In particular, bdr.bdr_nodes will get
-            * copied over.
+            * XXX This is a hack. What we really *should* be doing is asking
+            * the target node to send a catchup confirmation wal message, then
+            * wait until all its current peers (we aren't one yet) reply with
+            * confirmation. Then we should be replaying until we get
+            * confirmation of this from the init target node, rather than
+            * replaying to some specific LSN. The full part/join
+            * protocol should take care of this.
             */
-           elog(DEBUG1, "bdr %s: creating and restoring dump for %s",
-                NameStr(*dbname), init_replica_config->name);
-           bdr_exec_init_replica(init_replica_config, init_snapshot);
-           PQfinish(init_repl_conn);
+           elog(DEBUG3, "forcing a new transaction on the target node");
+           perform_pointless_transaction(nonrepl_init_conn, local_node);
 
-           pfree(init_snapshot);
-           status = bdr_set_remote_status(nonrepl_init_conn, 'c', status);
+           bdr_get_remote_nodeinfo_internal(nonrepl_init_conn, &ri);
+
+           /* Launch the catchup worker and wait for it to finish */
+           elog(DEBUG1, "launching catchup mode apply worker");
+           bdr_catchup_to_lsn(&ri, min_remote_lsn);
+
+           free_remote_node_info(&ri);
+
+           /*
+            * We're done with catchup. The next phase is inserting our
+            * conninfo, so set status=o
+            */
+           status = 'o';
+           bdr_nodes_set_local_status(status);
+           elog(DEBUG1, "catchup worker finished, requesting slot creation");
        }
 
-       Assert(status == 'c');
+       /* To reach here we must be waiting for slot creation */
+       Assert(status == 'o');
+
+       /*
+        * It is now safe to start apply workers, as we've finished catchup.
+        * Doing so ensures that we will replay our own bdr.bdr_nodes changes
+        * from the target node and also makes sure we stay more up-to-date,
+        * reducing slot lag on other nodes.
+        */
+       bdr_launch_apply_workers(MyDatabaseId);
 
-       /* Launch the catchup worker and wait for it to finish */
-       elog(DEBUG1, "bdr %s: launching catchup mode apply worker", NameStr(*dbname));
-       min_remote_lsn = bdr_get_remote_lsn(nonrepl_init_conn);
-       bdr_catchup_to_lsn(
-           init_replica_worker->data.apply.connection_config_idx,
-           min_remote_lsn);
-       status = bdr_set_remote_status(nonrepl_init_conn, 'r', status);
+       /*
+        * Insert our connection info on the remote end. This will prompt
+        * the other end to connect back to us and make a slot, and will
+        * cause the other nodes to do the same when they receive the new
+        * row.
+        *
+        * It makes no sense to do this with UDR, where the peer doesn't
+        * connect back to us.
+        */
+       if (local_conn_config != NULL)
+       {
+           elog(DEBUG1, "inserting our connection into into remote end");
+           bdr_insert_remote_conninfo(nonrepl_init_conn, local_conn_config);
+       }
+
+       /*
+        * Wait for all outbound and inbound slot creation to be complete.
+        *
+        * The inbound slots aren't yet required to relay local writes to
+        * remote nodes, but they'll be used to write our catchup
+        * confirmation request WAL message, so we need them to exist.
+        *
+        * This makes no sense on UDR, where the init target doesn't
+        * connect back to us and no other inbound or outbound connections
+        * exist. It still gets run, but we won't find any inbound
+        * slots to look for.
+        */
+       elog(DEBUG1, "waiting for all inbound slots to be created");
+       bdr_init_wait_for_slot_creation();
 
-       elog(INFO, "bdr %s: catchup worker finished, ready for normal replication",
-            NameStr(*dbname));
+       /*
+        * We now have inbound and outbound slots for all nodes, and
+        * we're caught up to a reasonably recent state from the target
+        * node thanks to the dump and catchup mode operation.
+        *
+        * Set the node state to 'r'eady and allow writes.
+        *
+        * TODO: Before we can really be sure we're ready we should be
+        * sending a replay confirmation request and waiting for all
+        * nodes to reply, so we know we have full communication.
+        */
+       status = 'r';
+       bdr_nodes_set_local_status(status);
+       elog(INFO, "finished init_replica, ready to enter normal replication");
    }
-   PG_END_ENSURE_ERROR_CLEANUP(bdr_init_replica_conn_close,
-           PointerGetDatum(&nonrepl_init_conn));
+   PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&nonrepl_init_conn));
+
+   Assert(status == 'r');
 
    PQfinish(nonrepl_init_conn);
 }
@@ -1103,32 +1126,27 @@ bdr_catchup_to_lsn_cleanup(int code, Datum offset)
  * When we finish applying and the worker exits, we'll be caught up with the
  * remote and in a consistent state where all our local replication identifiers
  * are consistent with the actual state of the local DB.
- *
- * Arguments:
- *
- * cfg_index: Index of the bdr connection for this dbname with init_worker=t
- * set within bdr_connection_configs. Used to start the worker.
- *
- * target_lsn: LSN of immediate origin node at which catchup should stop.
  */
 static void
-bdr_catchup_to_lsn(int cfg_index,
-                  XLogRecPtr target_lsn)
+bdr_catchup_to_lsn(remote_node_info *ri, XLogRecPtr target_lsn)
 {
    uint32 worker_shmem_idx;
    BdrWorker *worker;
-   BdrConnectionConfig *cfg;
-
-   cfg = bdr_connection_configs[cfg_index];
-   Assert(cfg != NULL);
-   Assert(cfg->init_replica);
+   BdrApplyWorker *catchup_worker;
 
-   elog(DEBUG1, "Registering bdr apply catchup worker %s for db %s to lsn %X/%X",
-        cfg->name, cfg->dbname,
+   elog(DEBUG1, "Registering bdr apply catchup worker for "BDR_LOCALID_FORMAT" to lsn %X/%X",
+        ri->sysid, ri->timeline, ri->dboid, EMPTY_REPLICATION_NAME,
         (uint32)(target_lsn>>32), (uint32)target_lsn);
 
    /* Create the shmem entry for the catchup worker */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
    worker = bdr_worker_shmem_alloc(BDR_WORKER_APPLY, &worker_shmem_idx);
+   catchup_worker = &worker->data.apply;
+   catchup_worker->dboid = MyDatabaseId;
+   catchup_worker->remote_sysid = ri->sysid;
+   catchup_worker->remote_timeline = ri->timeline;
+   catchup_worker->remote_dboid = ri->dboid;
+   LWLockRelease(BdrWorkerCtl->lock);
 
    /*
     * Launch the catchup worker, ensuring that we free the shmem slot for the
@@ -1146,10 +1164,7 @@ bdr_catchup_to_lsn(int cfg_index,
        BackgroundWorkerHandle *bgw_handle;
        pid_t bgw_pid;
        pid_t prev_bgw_pid = 0;
-       BdrApplyWorker *catchup_worker = &worker->data.apply;
-
-       /* Make sure the catchup worker can find its bdr.xxx_ GUCs */
-       catchup_worker->connection_config_idx = cfg_index;
+       uint32 worker_arg;
 
        /* Special parameters for a catchup worker only */
        catchup_worker->replay_stop_lsn = target_lsn;
@@ -1164,14 +1179,16 @@ bdr_catchup_to_lsn(int cfg_index,
        strncpy(bgw.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
 
        bgw.bgw_restart_time = BGW_NEVER_RESTART;
+       Assert(MyProc->pid != 0);
        bgw.bgw_notify_pid = MyProc->pid;
-       bgw.bgw_main_arg = Int32GetDatum(worker_shmem_idx);
+
+       Assert(worker_shmem_idx <= UINT16_MAX);
+       worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)worker_shmem_idx;
+       bgw.bgw_main_arg = Int32GetDatum(worker_arg);
 
        snprintf(bgw.bgw_name, BGW_MAXLEN,
-                "bdr %s: catchup apply to %X/%X on %s",
-                cfg->dbname,
-                (uint32)(target_lsn >> 32), (uint32)target_lsn,
-                cfg->name);
+                "bdr: catchup apply to %X/%X",
+                (uint32)(target_lsn >> 32), (uint32)target_lsn);
        bgw.bgw_name[BGW_MAXLEN-1] = '\0';
 
        /* Launch the catchup worker and wait for it to start */
@@ -1228,14 +1245,12 @@ bdr_catchup_to_lsn(int cfg_index,
        {
            /* Worker must've died before it finished */
            elog(ERROR,
-                "bdr %s: catchup worker exited before catching up to target LSN %X/%X",
-                cfg->dbname,
+                "catchup worker exited before catching up to target LSN %X/%X",
                 (uint32)(target_lsn>>32), (uint32)target_lsn);
        }
        else
        {
-           elog(DEBUG1, "bdr %s: catchup worker caught up to target LSN",
-                cfg->dbname);
+           elog(DEBUG1, "catchup worker caught up to target LSN");
        }
    }
    PG_END_ENSURE_ERROR_CLEANUP(bdr_catchup_to_lsn_cleanup,
index 9798731cd9f796da265dcf254478b7868a1c8069..8c529511990f290cfb07c7456e60f976d0537b8d 100644 (file)
 
 #include "lib/ilist.h"
 
+#define EMPTY_REPLICATION_NAME ""
 #define BDR_SLOT_NAME_FORMAT "bdr_%u_%s_%u_%u__%s"
 #define BDR_NODE_ID_FORMAT "bdr_"UINT64_FORMAT"_%u_%u_%u_%s"
 
-/* GUC storage for a configured BDR connection. */
+/* A configured BDR connection from bdr_connections */
 typedef struct BdrConnectionConfig
 {
-   char *dsn;
-   int   apply_delay;
-   bool  init_replica;
-   char *replica_local_dsn;
-   char *replication_sets;
+   uint64      sysid;
+   TimeLineID  timeline;
+   Oid         dboid;
 
    /*
-    * These aren't technically GUCs, but are per-connection config
-    * information obtained from the GUCs.
+    * If the origin_ id fields are set then they must refer to our node,
+    * otherwise we wouldn't load the configuration entry. So if
+    * origin_is_my_id is false the origin was zero, and if true the origin
+    * is the local node id.
     */
-   char *name;
-   char *dbname;
+   bool origin_is_my_id;
 
-   /* Connection config might be broken (blank dsn, etc) */
-   bool is_valid;
+   /*
+    * Is this connection unidirectional, or should we expect a reciprocal
+    * inbound connection and slot?
+    */
+   bool is_unidirectional;
+
+   char *dsn;
+
+   int   apply_delay;
+
+   /* Quoted identifier-list of replication sets */
+   char *replication_sets;
 } BdrConnectionConfig;
 
 typedef struct BdrFlushPosition
@@ -49,5 +59,15 @@ extern volatile sig_atomic_t got_SIGHUP;
 
 extern void bdr_error_nodeids_must_differ(uint64 sysid, TimeLineID timeline,
                                          Oid dboid);
+extern List* bdr_read_connection_configs(void);
+extern BdrConnectionConfig* bdr_get_connection_config(uint64 sysid,
+                                                     TimeLineID timeline,
+                                                     Oid dboid,
+                                                     bool missing_ok);
+
+extern void bdr_free_connection_config(BdrConnectionConfig *cfg);
+
+extern void bdr_slot_name(Name slot_name, uint64 sysid, TimeLineID tlid,
+                         Oid dboid, Oid local_dboid);
 
 #endif   /* BDR_INTERNAL_H */
index bd2b7cb0781597fbe6a11cc8212c834c1c437e34..f29108ffe5454b34e798a44b15666388f532b739 100644 (file)
@@ -2,32 +2,11 @@ include = 'bdr_regress_common.conf'
 
 track_commit_timestamp = on
 
-bdr.connections = 'node1to2,node1to3,node2to3,node2to1,node3to1,node3to2'
-
-bdr.node1to2_dsn = 'dbname=node2'
-bdr.node1to2_local_dbname = 'node1'
-bdr.node1to3_dsn = 'dbname=node3'
-bdr.node1to3_local_dbname = 'node1'
-
-bdr.node2to1_dsn = 'dbname=node1'
-bdr.node2to1_local_dbname = 'node2'
-#bdr.node2to1_init_replica=on
-#bdr.node2to1_replica_local_dsn='dbname=node2'
-bdr.node2to3_dsn = 'dbname=node3'
-bdr.node2to3_local_dbname = 'node2'
-
-bdr.node3to1_dsn = 'dbname=node1'
-bdr.node3to1_local_dbname = 'node3'
-#bdr.node3to1_init_replica=on
-#bdr.node3to1_replica_local_dsn='dbname=node3'
-bdr.node3to2_dsn = 'dbname=node2'
-bdr.node3to2_local_dbname = 'node3'
-
 bdr.log_conflicts_to_table = True
 bdr.default_apply_delay = 100
 
 #log_min_messages = 'debug4'
 #log_line_prefix = 'd=%d p=%p a=%a%q '
-log_statement = 'all'
+#log_statement = 'all'
 
 max_worker_processes = 18
index ed20add9d402ee9edc68abfc2e304b5b1f216d4a..c698238a179089e001ce614ab56e1ad9f8e8da0d 100644 (file)
@@ -36,7 +36,7 @@ void
 bdr_label_init(void)
 {
    /* Security label provider hook */
-   register_label_provider("bdr", bdr_object_relabel);
+   register_label_provider(BDR_SECLABEL_PROVIDER, bdr_object_relabel);
 }
 
 static void
index 0103a2a98e8de8e0168e25f13eba141cd938aac0..8cba0736b4db0df675bf1dc76245e6ccc51f444f 100644 (file)
@@ -8,4 +8,6 @@
  * bdr_label.h
  */
 
+#define BDR_SECLABEL_PROVIDER "bdr"
+
 extern void bdr_label_init(void);
index 933afb08f46af1b831bdbe0064df70ac3f495105..a6fa042d47401c6e3d3887025e3ee62e6c2cb866 100644 (file)
@@ -147,9 +147,6 @@ static BdrLocksCtl *bdr_locks_ctl;
 /* shmem init hook to chain to on startup, if any */
 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
 
-/* number of per database slots */
-static int bdr_locks_num_databases;
-
 /* this database's state */
 static BdrLocksDBState *bdr_my_locks_database = NULL;
 
@@ -161,7 +158,7 @@ bdr_locks_shmem_size(void)
    Size        size = 0;
 
    size = add_size(size, sizeof(BdrLocksCtl));
-   size = add_size(size, mul_size(sizeof(BdrLocksDBState), bdr_locks_num_databases));
+   size = add_size(size, mul_size(sizeof(BdrLocksDBState), bdr_max_databases));
 
    return size;
 }
@@ -188,13 +185,12 @@ bdr_locks_shmem_startup(void)
 
 /* Needs to be called from a shared_preload_library _PG_init() */
 void
-bdr_locks_shmem_init(Size num_used_databases)
+bdr_locks_shmem_init()
 {
    /* Must be called from postmaster its self */
    Assert(IsPostmasterEnvironment && !IsUnderPostmaster);
 
    bdr_locks_ctl = NULL;
-   bdr_locks_num_databases = num_used_databases;
 
    RequestAddinShmemSpace(bdr_locks_shmem_size());
    RequestAddinLWLocks(1);
@@ -212,7 +208,7 @@ bdr_locks_find_database(Oid dboid, bool create)
    int off;
    int free_off = -1;
 
-   for(off = 0; off < bdr_locks_num_databases; off++)
+   for(off = 0; off < bdr_max_databases; off++)
    {
        BdrLocksDBState *db = &bdr_locks_ctl->dbstate[off];
 
@@ -243,14 +239,11 @@ bdr_locks_find_database(Oid dboid, bool create)
        db->in_use = true;
        return db;
    }
-   /*
-    * Shouldn't happen with BDR statically configured, as the shmem segment
-    * gets sized for the number of BDR-enabled databases. Later will be
-    * affected by any bdr_max_databases setting or whatever we add.
-    */
-   ereport(PANIC,
+
+   ereport(ERROR,
            (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
-           "Too many databases in use with BDR"));
+           errmsg("Too many databases BDR-enabled for bdr.max_databases"),
+           errhint("Increase bdr.max_databases above the current limit of %d", bdr_max_databases)));
 }
 
 static void
@@ -273,7 +266,7 @@ bdr_locks_find_my_database(bool create)
  * Called from the per-db worker.
  */
 void
-bdr_locks_startup(Size nnodes)
+bdr_locks_startup()
 {
    Relation        rel;
    SysScanDesc     scan;
@@ -296,7 +289,8 @@ bdr_locks_startup(Size nnodes)
    if (bdr_my_locks_database->locked_and_loaded)
        return;
 
-   bdr_my_locks_database->nnodes = nnodes;
+   /* We haven't yet established how many nodes we're connected to. */
+   bdr_my_locks_database->nnodes = 0;
 
    initStringInfo(&s);
 
@@ -387,6 +381,42 @@ bdr_locks_startup(Size nnodes)
    bdr_my_locks_database->locked_and_loaded = true;
 }
 
+void
+bdr_locks_set_nnodes(Size nnodes)
+{
+   Assert(IsBackgroundWorker);
+   Assert(bdr_my_locks_database != NULL);
+
+   /*
+    * Store the given peer node count in this database's lock state. As the
+    * assertions above require, this must run in a background worker after
+    * bdr_my_locks_database has been set up.
+    *
+    * XXX DYNCONF No protection against node addition during DDL lock acquire
+    *
+    * Node counts are currently grabbed straight from the perdb worker's shmem
+    * and could change whenever someone adds a worker, with no locking or
+    * protection.
+    *
+    * We could acquire the local DDL lock before setting the nodecount, which
+    * would cause requests from other nodes to get rejected and cause other
+    * local tx's to fail to request the global DDL lock. However, we'd have to
+    * acquire it when we committed to adding the new worker, which happens in
+    * a user backend, and release it from the perdb worker once the new worker
+    * is registered. Fragile.
+    *
+    * Doing so also fails to solve the other half of the problem, which is
+    * that DDL locking expects there to be one bdr walsender for each apply
+    * worker, i.e. each connection should be reciprocal. We could connect to
+    * the other end and register a connection back to us, but that's getting
+    * complicated for what's always going to be a temporary option before a
+    * full part/join protocol is added.
+    *
+    * So we're just going to cross our fingers. Worst case is that DDL locking
+    * gets stuck and we have to restart all the nodes.
+    *
+    * The full part/join protocol will solve this by acquiring the DDL lock
+    * before joining.
+    */
+   bdr_my_locks_database->nnodes = nnodes;
+}
+
 
 static void
 bdr_prepare_message(StringInfo s, BdrMessageType message_type)
@@ -506,6 +536,14 @@ bdr_acquire_ddl_lock(void)
 
    bdr_locks_find_my_database(false);
 
+   if (bdr_my_locks_database->nnodes == 0)
+   {
+       ereport(ERROR,
+               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                errmsg("No peer nodes or peer node count unknown, cannot acquire DDL lock"),
+                errhint("BDR is probably still starting up, wait a while")));
+   }
+
    elog(DEBUG2, "attempting to acquire global DDL lock for (" BDR_LOCALID_FORMAT ")", BDR_LOCALID_FORMAT_ARGS);
 
    /* send message about ddl lock */
@@ -996,7 +1034,7 @@ bdr_process_decline_ddl_lock(uint64 origin_sysid, TimeLineID origin_tli, Oid ori
  * Another node has asked us to confirm that we've replayed up to a given LSN.
  * We've seen the request message, so send the requested confirmation.
  *
- * Runs in the walsender.
+ * Runs in the apply worker.
  */
 void
 bdr_process_request_replay_confirm(uint64 sysid, TimeLineID tli,
@@ -1257,12 +1295,12 @@ bdr_locks_check_query(void)
 
 /* bdr_locks are not used by UDR at the moment */
 void
-bdr_locks_startup(Size nnodes)
+bdr_locks_startup()
 {
 }
 
 void
-bdr_locks_shmem_init(Size num_used_databases)
+bdr_locks_shmem_init()
 {
 }
 
index 5549089d119535cc9d7d3c961db511d82e5e4671..4b855f3bbcea76cd61ee3ecf2ad35e5fdc26c9ad 100644 (file)
@@ -21,7 +21,8 @@ typedef enum BdrMessageType
    BDR_MESSAGE_REPLAY_CONFIRM = 6
 } BdrMessageType;
 
-void bdr_locks_startup(Size nnodes);
+void bdr_locks_startup(void);
+void bdr_locks_set_nnodes(Size nnodes);
 void bdr_acquire_ddl_lock(void);
 void bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid);
 void bdr_process_release_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid,
index 90606c0f1ad3dc3a5dd3cbf4f8aebadf4be2caf3..e6d63355598b8dfea03d822cecb9cdf00270d223 100644 (file)
@@ -74,6 +74,7 @@ typedef struct
    bool client_float8_byval;
    bool client_int_datetime;
    char *client_db_encoding;
+   bool client_unidirectional;
    Oid bdr_schema_oid;
    Oid bdr_conflict_handlers_reloid;
    Oid bdr_locks_reloid;
@@ -228,14 +229,19 @@ bdr_req_param(const char *param)
  * If this function returns it's safe to begin replay.
  */
 static void
-bdr_ensure_node_ready()
+bdr_ensure_node_ready(BdrOutputData *data)
 {
    int spi_ret;
    const uint64 sysid = GetSystemIdentifier();
    char status;
+   BDRNodeInfo *node;
    NameData dbname;
    char *tmp_dbname;
 
+   /* Unidirectional connections don't require any checks atm. */
+   if (data->client_unidirectional)
+       return;
+
    /* We need dbname valid outside this transaction, so copy it */
    tmp_dbname = get_database_name(MyDatabaseId);
    strncpy(NameStr(dbname), tmp_dbname, NAMEDATALEN);
@@ -250,32 +256,18 @@ bdr_ensure_node_ready()
    if (spi_ret != SPI_OK_CONNECT)
        elog(ERROR, "Local SPI connect failed; shouldn't happen");
 
-   status = bdr_nodes_get_local_status(sysid, ThisTimeLineID, MyDatabaseId);
+   node = bdr_nodes_get_local_info(sysid, ThisTimeLineID, MyDatabaseId);
+   status = node == NULL ? '\0' : node->status;
+   bdr_bdr_node_free(node);
 
    SPI_finish();
 
-/*
- * There is no local node status for UDR as we have only connection to this
- * node coming from a slave. The above is still useful to make sure the
- * extension is installed in the db.
- */
-#ifdef BUILDING_UDR
-   switch (status)
-   {
-       case 'r':
-       case '\0':
-       case 'c':
-       case 'i':
-           break;
-       default:
-           elog(ERROR, "Unhandled case status=%c", status);
-           break;
-   }
-#else
-
-   /* Complain if node isn't ready. */
+   /*
+    * Complain if the node isn't ready. "Ready" here means its state is
+    * either fully 'r'eady or waiting for inbound sl'o't creation.
+    */
    /* TODO: Allow soft error so caller can sleep and recheck? */
-   if (status != 'r')
+   if (status != 'r' && status != 'o')
    {
        const char * const base_msg =
            "bdr output plugin: slot creation rejected, bdr.bdr_nodes entry for local node (sysid=" UINT64_FORMAT
@@ -283,8 +275,10 @@ bdr_ensure_node_ready()
        switch (status)
        {
            case 'r':
+           case 'o':
                break; /* unreachable */
            case '\0':
+           case 'b':
                /*
                 * Can't allow replay when BDR hasn't started yet, as
                 * replica init might still need to run, causing a dump to
@@ -338,7 +332,6 @@ bdr_ensure_node_ready()
                break;
        }
    }
-#endif
 }
 
 
@@ -411,6 +404,8 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
            data->client_db_encoding = pstrdup(strVal(elem->arg));
        else if (strcmp(elem->defname, "forward_changesets") == 0)
            bdr_parse_bool(elem, &data->forward_changesets);
+       else if (strcmp(elem->defname, "unidirectional") == 0)
+           bdr_parse_bool(elem, &data->client_unidirectional);
        else if (strcmp(elem->defname, "replication_sets") == 0)
        {
            int i;
@@ -482,12 +477,7 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
        StartTransactionCommand();
    }
 
-#ifdef BUILDING_BDR
-   /*
-    * If running BDR, we expect the remote end (us) to have the BDR extension
-    * installed before we permit slot creation. This prevents replication of
-    * the CREATE EXTENSION bdr; command its self.
-    */
+   /* BDR extension must be installed. */
    if (get_namespace_oid("bdr", true) == InvalidOid)
    {
        ereport(ERROR,
@@ -496,7 +486,6 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
                        BDR_LOCALID_FORMAT_ARGS),
                 errdetail("Cannot create a BDR slot without the BDR extension installed")));
    }
-#endif
 
    /* no options are passed in during initialization, so don't complain there */
    if (!is_init)
@@ -521,6 +510,15 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
        if (data->client_db_encoding == NULL)
            bdr_req_param("db_encoding");
 
+#ifdef BUILDING_UDR
+       /* UDR can't do bidirectional connections, so reject any client
+        * that didn't pass the "unidirectional" startup option. */
+       if (!data->client_unidirectional)
+           ereport(ERROR,
+                   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                    errmsg("UDR only supports unidirectional connections")));
+#endif
+
        /* check incompatibilities we cannot work around */
        if (strcmp(data->client_db_encoding, GetDatabaseEncodingName()) != 0)
            elog(ERROR, "mismatching encodings are not yet supported");
@@ -583,7 +581,7 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
        if (data->client_pg_version / 100 != PG_VERSION_NUM / 100)
            data->allow_sendrecv_protocol = false;
 
-       bdr_maintain_schema();
+       bdr_maintain_schema(false);
 
        data->bdr_schema_oid = get_namespace_oid("bdr", true);
        schema_oid = data->bdr_schema_oid;
@@ -618,7 +616,7 @@ pg_decode_startup(LogicalDecodingContext * ctx, OutputPluginOptions *opt, bool i
         * This'll ERROR out if we're not ready. Note that this does NOT
         * prevent slot creation, only START_REPLICATION from the slot.
         */
-       bdr_ensure_node_ready();
+       bdr_ensure_node_ready(data);
    }
 
    if (tx_started)
index 644de7fa6a6d2a3f4070b7456798f2a94f5a5336..0f5c72d48012753cd066f99ca0d7daee397a819b 100644 (file)
 #include "utils/memutils.h"
 #include "utils/snapmgr.h"
 
+PG_FUNCTION_INFO_V1(bdr_connections_changed);
+
+Datum
+bdr_connections_changed(PG_FUNCTION_ARGS);
+
+/* In the commit hook, should we attempt to start a per-db worker? */
+static bool xacthook_connection_added = false;
+
+/*
+ * Look through the shared-memory worker slots for the per-db worker that
+ * belongs to database 'dboid'.
+ *
+ * Returns the index of the matching slot, or -1 when no per-db worker is
+ * registered for that database. When a match is found and worker_found is
+ * non-NULL, *worker_found is pointed at the slot; otherwise it is left
+ * untouched.
+ *
+ * The caller must already hold the LWLock on the worker control segment;
+ * share mode is sufficient.
+ *
+ * Finding a slot says nothing about whether the worker has actually
+ * started up yet.
+ */
+int
+find_perdb_worker_slot(Oid dboid, BdrWorker **worker_found)
+{
+   int slotno;
+
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+   for (slotno = 0; slotno < bdr_max_workers; slotno++)
+   {
+       BdrWorker *candidate = &BdrWorkerCtl->slots[slotno];
+
+       /* Only per-db worker slots are of interest */
+       if (candidate->worker_type != BDR_WORKER_PERDB)
+           continue;
+
+       if (candidate->data.perdb.database_oid != dboid)
+           continue;
+
+       if (worker_found != NULL)
+           *worker_found = candidate;
+
+       return slotno;
+   }
+
+   return -1;
+}
+
+/*
+ * Look through the shared-memory worker slots for the apply worker that
+ * serves the current database (MyDatabaseId) and replicates from the remote
+ * node identified by (sysid, timeline, dboid).
+ *
+ * Returns the index of the matching slot, or -1 when there is none. When a
+ * match is found and worker_found is non-NULL, *worker_found is pointed at
+ * the slot; otherwise it is left untouched.
+ *
+ * Only callable from a per-db worker. The caller must already hold the
+ * LWLock on the worker control segment; share mode is sufficient.
+ *
+ * Finding a slot says nothing about whether the worker has actually
+ * started up yet.
+ */
+static int
+find_apply_worker_slot(uint64 sysid, TimeLineID timeline, Oid dboid, BdrWorker **worker_found)
+{
+   int slotno;
+
+   Assert(bdr_worker_type == BDR_WORKER_PERDB);
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+   for (slotno = 0; slotno < bdr_max_workers; slotno++)
+   {
+       BdrWorker      *candidate = &BdrWorkerCtl->slots[slotno];
+       BdrApplyWorker *apply;
+
+       /* Only apply worker slots are of interest */
+       if (candidate->worker_type != BDR_WORKER_APPLY)
+           continue;
+
+       apply = &candidate->data.apply;
+
+       /* Skip workers for another local db or another remote node */
+       if (apply->dboid != MyDatabaseId
+           || apply->remote_sysid != sysid
+           || apply->remote_timeline != timeline
+           || apply->remote_dboid != dboid)
+           continue;
+
+       if (worker_found != NULL)
+           *worker_found = candidate;
+
+       return slotno;
+   }
+
+   return -1;
+}
+
+/*
+ * Transaction callback installed by bdr_connections_changed().
+ *
+ * On commit, if a connection change was flagged during the transaction,
+ * wake whichever process has to act on it: the per-db worker for this
+ * database if one is registered in shmem, otherwise the supervisor.
+ *
+ * On abort the pending flag is discarded, so that a rolled-back change
+ * doesn't cause a spurious wakeup when some later, unrelated transaction
+ * in this backend commits.
+ *
+ * 'arg' is unused.
+ */
+static void
+bdr_perdb_xact_callback(XactEvent event, void *arg)
+{
+   switch (event)
+   {
+       case XACT_EVENT_COMMIT:
+           if (xacthook_connection_added)
+           {
+               int slotno;
+               BdrWorker *w;
+
+               xacthook_connection_added = false;
+
+               LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+               /*
+                * If a perdb worker already exists, wake it and tell it to
+                * check for new connections.
+                */
+               slotno = find_perdb_worker_slot(MyDatabaseId, &w);
+               if (slotno >= 0)
+               {
+                   /*
+                    * The worker is registered, but might not be started yet
+                    * (or could be crashing and restarting). If it's not
+                    * started the latch will be zero. If it's started but
+                    * dead, the latch will be bogus, but it's safe to set a
+                    * proclatch to a dead process. At worst we'll set a latch
+                    * for the wrong process, and that's fine. If it's zero
+                    * then the worker is still starting and will see our new
+                    * changes anyway.
+                    */
+                   if (w->data.perdb.proclatch != NULL)
+                       SetLatch(w->data.perdb.proclatch);
+               }
+               else
+               {
+                   /*
+                    * Per-db worker doesn't exist, ask the supervisor to check for
+                    * changes and register new per-db workers for labeled
+                    * databases.
+                    */
+                   if (BdrWorkerCtl->supervisor_latch)
+                       SetLatch(BdrWorkerCtl->supervisor_latch);
+               }
+
+               LWLockRelease(BdrWorkerCtl->lock);
+           }
+           break;
+       case XACT_EVENT_ABORT:
+           /* The change was rolled back; there is nothing to announce */
+           xacthook_connection_added = false;
+           break;
+       default:
+           /* We're not interested in other tx events */
+           break;
+   }
+}
+
+/*
+ * SQL-callable: note that this transaction changed the BDR connection
+ * configuration.
+ *
+ * Nothing is launched or signalled here. The function only flags the change
+ * and makes sure bdr_perdb_xact_callback is installed; the callback then
+ * wakes the per-db worker (or the supervisor, if no per-db worker is
+ * registered for this database) once the transaction commits.
+ */
+Datum
+bdr_connections_changed(PG_FUNCTION_ARGS)
+{
+   /*
+    * Transaction callbacks stay registered until explicitly unregistered,
+    * so install ours only once per backend rather than once per flagged
+    * transaction, which would accumulate duplicate registrations.
+    */
+   static bool xacthook_registered = false;
+
+   if (!xacthook_registered)
+   {
+       RegisterXactCallback(bdr_perdb_xact_callback, NULL);
+       xacthook_registered = true;
+   }
+
+   /* Ask the commit hook to notify the workers */
+   xacthook_connection_added = true;
+
+   PG_RETURN_VOID();
+}
+
+/*
+ * Return the attribute number of column 'colname' in the current SPI result
+ * set (SPI_tuptable), raising an ERROR if the result has no such column.
+ */
+static int
+getattno(const char *colname)
+{
+   int colno = SPI_fnumber(SPI_tuptable->tupdesc, colname);
+
+   if (colno == SPI_ERROR_NOATTRIBUTE)
+       elog(ERROR, "SPI error while reading %s from bdr.bdr_connections", colname);
+
+   return colno;
+}
+
 /*
  * Launch a dynamic bgworker to run bdr_apply_main for each bdr connection on
  * the database identified by dbname.
  *
- * Scans the BdrWorkerCtl shmem segment for workers of type BDR_WORKER_APPLY
- * with a matching database name and launches them.
+ * Scans the bdr.bdr_connections table and launches a worker for any
+ * connection that doesn't already have one.
  */
-static List*
-bdr_launch_apply_workers(char *dbname)
+void
+bdr_launch_apply_workers(Oid dboid)
 {
-   List             *apply_workers = NIL;
-   BackgroundWorker  apply;
-   int               i;
-
+   BackgroundWorker    bgw;
+   int                 i, ret;
+   Size                nnodes = 0;
+#define BDR_CON_Q_NARGS 3
+   Oid                 argtypes[BDR_CON_Q_NARGS] = { TEXTOID, OIDOID, OIDOID };
+   Datum               values[BDR_CON_Q_NARGS];
+   char                sysid_str[33];
+
+   /* Should be called from the perdb worker */
    Assert(IsBackgroundWorker);
+   Assert(bdr_worker_type == BDR_WORKER_PERDB);
+
+   snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, GetSystemIdentifier());
+   sysid_str[sizeof(sysid_str)-1] = '\0';
+
+   elog(DEBUG2, "launching apply workers");
+
+   /*
+    * It's easy enough to make this tolerant of an open tx, but in general
+    * rollback doesn't make sense here.
+    */
+   Assert(!IsTransactionState());
 
    /* Common apply worker values */
-   apply.bgw_flags = BGWORKER_SHMEM_ACCESS |
+   bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
        BGWORKER_BACKEND_DATABASE_CONNECTION;
-   apply.bgw_start_time = BgWorkerStart_RecoveryFinished;
-   apply.bgw_main = NULL;
-   strncpy(apply.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
-   strncpy(apply.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
-   apply.bgw_restart_time = 5;
-   apply.bgw_notify_pid = 0;
-
-   /* Launch apply workers */
-   LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
-   for (i = 0; i < bdr_max_workers; i++)
+   bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   bgw.bgw_main = NULL;
+   strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+   strncpy(bgw.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
+   bgw.bgw_restart_time = 5;
+   bgw.bgw_notify_pid = 0;
+
+   StartTransactionCommand();
+
+   /*
+    * Look up connection entries for all nodes other than our own.
+    *
+    * If an entry with our origin (sysid,tlid,dboid) exists, treat that as
+    * overriding the generic one.
+    */
+   values[0] = CStringGetTextDatum(sysid_str);
+   values[1] = ObjectIdGetDatum(ThisTimeLineID);
+   values[2] = ObjectIdGetDatum(MyDatabaseId);
+
+   SPI_connect();
+
+   ret = SPI_execute_with_args(
+           "SELECT DISTINCT ON (conn_sysid, conn_timeline, conn_dboid) "
+           "  conn_sysid, conn_timeline, conn_dboid, "
+           "  conn_is_unidirectional, "
+           "  conn_origin_dboid <> 0 AS origin_is_my_id "
+           "FROM bdr.bdr_connections "
+           "WHERE ( "
+           "         (conn_origin_sysid = '0' AND "
+           "          conn_origin_timeline = 0 AND "
+           "          conn_origin_dboid = 0) "
+           "         OR "
+           "         (conn_origin_sysid = $1 AND "
+           "          conn_origin_timeline = $2 AND "
+           "          conn_origin_dboid = $3) "
+           "      ) AND NOT ( "
+           "          conn_sysid = $1 AND "
+           "          conn_timeline = $2 AND "
+           "          conn_dboid = $3"
+           "      ) "
+           "ORDER BY conn_sysid, conn_timeline, conn_dboid, "
+           "         conn_origin_sysid ASC NULLS LAST, "
+           "         conn_timeline ASC NULLS LAST, "
+           "         conn_dboid ASC NULLS LAST ",
+       BDR_CON_Q_NARGS, argtypes, values, NULL,
+       false, 0);
+
+   if (ret != SPI_OK_SELECT)
+       elog(ERROR, "SPI error while querying bdr.bdr_connections");
+
+   nnodes = SPI_processed;
+
+   elog(DEBUG2, "found %u workers in bdr_connections", (uint32)nnodes);
+
+   for (i = 0; i < SPI_processed; i++)
    {
-       BdrWorker *worker = &BdrWorkerCtl->slots[i];
+       BackgroundWorkerHandle *bgw_handle;
+       HeapTuple               tuple;
+       uint32                  slot;
+       uint32                  worker_arg;
+       BdrWorker              *worker;
+       BdrApplyWorker         *apply;
+       Datum                   temp_datum;
+       bool                    isnull;
+       uint64                  target_sysid;
+       TimeLineID              target_timeline;
+       Oid                     target_dboid;
+       char*                   tmp_sysid;
+       bool                    origin_is_my_id,
+                               conn_is_unidirectional;
+
+       tuple = SPI_tuptable->vals[i];
+
+       tmp_sysid = SPI_getvalue(tuple, SPI_tuptable->tupdesc,
+                                getattno("conn_sysid"));
+
+       if (sscanf(tmp_sysid, UINT64_FORMAT, &target_sysid) != 1)
+           elog(ERROR, "Parsing sysid uint64 from %s failed", tmp_sysid);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("conn_timeline"),
+                                  &isnull);
+       Assert(!isnull);
+       target_timeline = DatumGetObjectId(temp_datum);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("conn_dboid"),
+                                  &isnull);
+       Assert(!isnull);
+       target_dboid = DatumGetObjectId(temp_datum);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("conn_is_unidirectional"),
+                                  &isnull);
+       Assert(!isnull);
+       conn_is_unidirectional = DatumGetBool(temp_datum);
+
+       temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+                                  getattno("origin_is_my_id"),
+                                  &isnull);
+       Assert(!isnull);
+       origin_is_my_id = DatumGetBool(temp_datum);
+
+       elog(DEBUG2, "Found bdr_connections entry for "BDR_LOCALID_FORMAT" (origin specific: %d, unidirectional: %d)",
+            target_sysid, target_timeline, target_dboid,
+            EMPTY_REPLICATION_NAME, (int)origin_is_my_id, (int)conn_is_unidirectional);
+
+       Assert(!LWLockHeldByMe(BdrWorkerCtl->lock));
+       LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
 
-       switch(worker->worker_type)
+       /*
+        * Is there already a worker registered for this connection?
+        *
+        * TODO DYNCONF Each apply worker should have its latch set and respond
+        * by checking to see whether it needs to apply any new configuration.
+        */
+       if (find_apply_worker_slot(target_sysid, target_timeline, target_dboid, NULL) != -1)
        {
-           case BDR_WORKER_APPLY:
-               {
-                   BdrApplyWorker *con = &worker->data.apply;
-                   BdrConnectionConfig *cfg =
-                       bdr_connection_configs[con->connection_config_idx];
-                   Assert(cfg != NULL);
-                   if ( strcmp(cfg->dbname, dbname) == 0 )
-                   {
-                       /* It's an apply worker for our DB; register it */
-                       BackgroundWorkerHandle *bgw_handle;
-
-                       if (con->bgw_is_registered)
-                           /*
-                            * This worker was registered on a previous pass;
-                            * this is probably a restart of the per-db worker.
-                            * Don't register a duplicate.
-                            */
-                           continue;
-
-                       snprintf(apply.bgw_name, BGW_MAXLEN,
-                                BDR_LOCALID_FORMAT": %s: apply",
-                                BDR_LOCALID_FORMAT_ARGS, cfg->name);
-                       apply.bgw_main_arg = Int32GetDatum(i);
-
-                       if (!RegisterDynamicBackgroundWorker(&apply,
-                                                            &bgw_handle))
-                       {
-                           ereport(ERROR,
-                                   (errmsg("bdr: Failed to register background worker"
-                                           " %s, see previous log messages",
-                                           cfg->name)));
-                       }
-                       /* We've launched this one, don't do it again */
-                       con->bgw_is_registered = true;
-                       apply_workers = lcons(bgw_handle, apply_workers);
-                   }
-               }
-               break;
-           case BDR_WORKER_EMPTY_SLOT:
-           case BDR_WORKER_PERDB:
-               /* Nothing to do; switch only so we get warnings for insane cases */
-               break;
-           default:
-               /* Bogus value */
-               elog(FATAL, "Unhandled BdrWorkerType case %i, memory corruption?",
-                    worker->worker_type);
-               break;
+           elog(DEBUG2, "Skipping registration of worker for node "BDR_LOCALID_FORMAT" on db oid=%u: already registered",
+                target_sysid, target_timeline, target_dboid,
+                EMPTY_REPLICATION_NAME, dboid);
+           LWLockRelease(BdrWorkerCtl->lock);
+           continue;
+       }
+
+       /* Set the display name in 'ps' etc */
+       snprintf(bgw.bgw_name, BGW_MAXLEN,
+                BDR_LOCALID_FORMAT"->"BDR_LOCALID_FORMAT,
+                BDR_LOCALID_FORMAT_ARGS,
+                target_sysid, target_timeline, target_dboid,
+                EMPTY_REPLICATION_NAME);
+
+       /* Allocate a new shmem slot for this apply worker */
+       worker = bdr_worker_shmem_alloc(BDR_WORKER_APPLY, &slot);
+
+       /* Tell the apply worker what its shmem slot is */
+       Assert(slot <= UINT16_MAX);
+       worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)slot;
+       bgw.bgw_main_arg = Int32GetDatum(worker_arg);
+
+       /*
+        * Apply workers (other than in catchup mode, which are registered
+        * elsewhere) should not be using the local node's connection entry.
+        */
+       Assert(!(target_sysid == GetSystemIdentifier() &&
+                target_timeline == ThisTimeLineID &&
+                target_dboid == MyDatabaseId));
+
+       /* Now populate the apply worker state */
+       apply = &worker->data.apply;
+       apply->dboid = MyDatabaseId;
+       apply->remote_sysid = target_sysid;
+       apply->remote_timeline = target_timeline;
+       apply->remote_dboid = target_dboid;
+       apply->replay_stop_lsn = InvalidXLogRecPtr;
+       apply->forward_changesets = false;
+
+       LWLockRelease(BdrWorkerCtl->lock);
+
+       /*
+        * Finally, register the worker for launch.
+        */
+       if (!RegisterDynamicBackgroundWorker(&bgw,
+                                            &bgw_handle))
+       {
+           /*
+            * Already-registered workers will keep on running.  We need to
+            * make sure the slot we just acquired but failed to launch a
+            * worker for gets released again though.
+            */
+           LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+           apply->dboid = InvalidOid;
+           apply->remote_sysid = 0;
+           apply->remote_timeline = 0;
+           apply->remote_dboid = InvalidOid;
+           worker->worker_type = BDR_WORKER_EMPTY_SLOT;
+           LWLockRelease(BdrWorkerCtl->lock);
+
+           ereport(ERROR,
+                   (errmsg("bdr: Failed to register background worker"
+                           " for "BDR_LOCALID_FORMAT", see previous log messages",
+                           BDR_LOCALID_FORMAT_ARGS)));
+       }
+       else
+       {
+           elog(DEBUG2, "registered apply worker for "BDR_LOCALID_FORMAT,
+                target_sysid, target_timeline, target_dboid,
+                EMPTY_REPLICATION_NAME);
        }
    }
-   LWLockRelease(BdrWorkerCtl->lock);
 
-   return apply_workers;
+   SPI_finish();
+
+   CommitTransactionCommand();
+
+   elog(DEBUG2, "done registering apply workers");
+
+   /*
+    * Now we need to tell the lock manager and the sequence
+    * manager about the changed node count.
+    *
+    * There's no truly safe way to do this without a proper
+    * part/join protocol, so all we're going to do is update
+    * the node count in shared memory.
+    */
+   bdr_worker_slot->data.perdb.nnodes = nnodes;
+#ifdef BUILDING_BDR
+   bdr_locks_set_nnodes(nnodes);
+   bdr_sequencer_set_nnodes(nnodes);
+#endif
+
+   elog(DEBUG2, "updated worker counts");
 }
 
 /*
@@ -148,60 +479,106 @@ bdr_launch_apply_workers(char *dbname)
 void
 bdr_perdb_worker_main(Datum main_arg)
 {
-   int               rc = 0;
-   List             *apply_workers;
-   ListCell         *c;
-   BdrPerdbWorker   *perdb;
-   BdrWorker        *bdr_worker_slot;
-   StringInfoData    si;
-   bool              wait;
+   int                 rc = 0;
+   BdrPerdbWorker      *perdb;
+   StringInfoData      si;
+   bool                wait;
+   uint32              worker_arg;
+   uint16              worker_generation;
+   uint16              perdb_worker_idx;
+   BDRNodeInfo        *local_node;
 
    initStringInfo(&si);
 
    Assert(IsBackgroundWorker);
 
-   bdr_worker_slot = &BdrWorkerCtl->slots[ DatumGetInt32(main_arg) ];
+   worker_arg = DatumGetInt32(main_arg);
+
+   worker_generation = (uint16)(worker_arg >> 16);
+   perdb_worker_idx = (uint16)(worker_arg & 0x0000FFFF);
+
+   if (worker_generation != BdrWorkerCtl->worker_generation)
+   {
+       elog(DEBUG1, "perdb worker from generation %d exiting after finding shmem generation is %d",
+            worker_generation, BdrWorkerCtl->worker_generation);
+       proc_exit(0);
+   }
+
+   bdr_worker_slot = &BdrWorkerCtl->slots[perdb_worker_idx];
    Assert(bdr_worker_slot->worker_type == BDR_WORKER_PERDB);
    perdb = &bdr_worker_slot->data.perdb;
    bdr_worker_type = BDR_WORKER_PERDB;
 
    bdr_worker_init(NameStr(perdb->dbname));
 
+   perdb->nnodes = 0;
+
    elog(DEBUG1, "per-db worker for node " BDR_LOCALID_FORMAT " starting", BDR_LOCALID_FORMAT_ARGS);
 
-   appendStringInfo(&si, BDR_LOCALID_FORMAT": %s", BDR_LOCALID_FORMAT_ARGS, "perdb worker");
+   appendStringInfo(&si, BDR_LOCALID_FORMAT": %s", BDR_LOCALID_FORMAT_ARGS, "perdb");
    SetConfigOption("application_name", si.data, PGC_USERSET, PGC_S_SESSION);
 
    CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr seq top-level resource owner");
    bdr_saved_resowner = CurrentResourceOwner;
 
+   /*
+    * It's necessary to acquire a lock here so that a concurrent
+    * bdr_perdb_xact_callback can't try to set our latch at the same
+    * time as we write to it.
+    *
+    * There's no per-worker lock, so we just take the lock on the
+    * whole segment.
+    */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+   perdb->proclatch = &MyProc->procLatch;
+   perdb->database_oid = MyDatabaseId;
+   LWLockRelease(BdrWorkerCtl->lock);
+
    /* need to be able to perform writes ourselves */
    bdr_executor_always_allow_writes(true);
-   bdr_locks_startup(perdb->nnodes);
+   bdr_locks_startup();
+
+   {
+       int             spi_ret;
+       MemoryContext   saved_ctx;
+
+       /*
+        * Check the local bdr.bdr_nodes table to see if there's an entry for
+        * our node.
+        *
+        * Note that we don't have to explicitly SPI_finish(...) on error paths;
+        * that's taken care of for us.
+        */
+       StartTransactionCommand();
+       spi_ret = SPI_connect();
+       if (spi_ret != SPI_OK_CONNECT)
+           elog(ERROR, "SPI already connected; this shouldn't be possible");
+
+       saved_ctx = MemoryContextSwitchTo(TopMemoryContext);
+       local_node = bdr_nodes_get_local_info(GetSystemIdentifier(), ThisTimeLineID,
+                                         MyDatabaseId);
+       MemoryContextSwitchTo(saved_ctx);
+
+       if (local_node == NULL)
+           ereport(ERROR,
+                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                    errmsg("local node record not found")));
+
+       SPI_finish();
+       CommitTransactionCommand();
+   }
 
    /*
     * Do we need to init the local DB from a remote node?
-    *
-    * Checks bdr.bdr_nodes.status, does any remote initialization required if
-    * there's an init_replica connection, and ensures that
-    * bdr.bdr_nodes.status=r for our entry before continuing.
     */
-   bdr_init_replica(&perdb->dbname);
+   if (local_node->status != 'r')
+       bdr_init_replica(local_node);
 
-   elog(DEBUG1, "Starting bdr apply workers for db %s", NameStr(perdb->dbname));
+   elog(DEBUG1, "Starting bdr apply workers for "BDR_LOCALID_FORMAT" (%s)",
+        BDR_LOCALID_FORMAT_ARGS, NameStr(perdb->dbname));
 
    /* Launch the apply workers */
-   apply_workers = bdr_launch_apply_workers(NameStr(perdb->dbname));
-
-   /*
-    * For now, just free the bgworker handles. Later we'll probably want them
-    * for adding/removing/reconfiguring bgworkers.
-    */
-   foreach(c, apply_workers)
-   {
-       BackgroundWorkerHandle *h = (BackgroundWorkerHandle *) lfirst(c);
-       pfree(h);
-   }
+   bdr_launch_apply_workers(MyDatabaseId);
 
 #ifdef BUILDING_BDR
    elog(DEBUG1, "BDR starting sequencer on db \"%s\"",
@@ -260,8 +637,18 @@ bdr_perdb_worker_main(Datum main_arg)
            /* emergency bailout if postmaster has died */
            if (rc & WL_POSTMASTER_DEATH)
                proc_exit(1);
+
+           if (rc & WL_LATCH_SET)
+           {
+               /*
+                * If the perdb worker's latch is set we're being asked
+                * to rescan and launch new apply workers.
+                */
+               bdr_launch_apply_workers(MyDatabaseId);
+           }
        }
    }
 
+   perdb->database_oid = InvalidOid;
    proc_exit(0);
 }
index 47d1fe7978b25c9500f8c714ca8e6d3731e7b5f0..c5ae0546bb612ae0d3edf3c087cacce0de5151dd 100644 (file)
@@ -2,16 +2,6 @@ track_commit_timestamp = on
 
 include = 'bdr_regress_common.conf'
 
-bdr.connections = 'node1, node2'
-
-bdr.node1_dsn = 'dbname=postgres'
-bdr.node1_local_dbname = 'regression'
-bdr.node1_replication_sets = 'default, important, for-node-1'
-
-bdr.node2_dsn = 'dbname=regression'
-bdr.node2_local_dbname = 'postgres'
-bdr.node2_replication_sets = 'default, important, for-node-2, for-node-2-insert, for-node-2-update, for-node-2-delete'
-
 bdrtest.readdb1 = 'regression'
 bdrtest.readdb2 = 'postgres'
 bdrtest.writedb1 = 'regression'
index e386a10a6b169a48fbc4b021b3afa2792960370e..d0f46c2be11f84dd7b5631cc85936a680c22b314 100644 (file)
 
 #include "bdr.h"
 
+#include "access/genam.h"
 #include "access/heapam.h"
 #include "access/xact.h"
 
 #include "commands/seclabel.h"
 
+#include "utils/builtins.h"
 #include "utils/catcache.h"
+#include "utils/fmgroids.h"
 #include "utils/inval.h"
 #include "utils/jsonapi.h"
 #include "utils/json.h"
@@ -324,10 +327,6 @@ relation_in_replication_set(BDRRelation *r, const char *setname)
    return false;
 }
 
-#include "access/genam.h"
-#include "utils/builtins.h"
-#include "utils/fmgroids.h"
-
 static HeapTuple
 replset_lookup(Relation rel, const char *cname)
 {
index 0c1742e8eefa9571376046f637173645b137a887..ac9602e029853812e7ec640f906a6ec24d4b563c 100644 (file)
--- a/bdr_seq.c
+++ b/bdr_seq.c
@@ -50,7 +50,7 @@ typedef struct BdrSequencerSlot
 
 typedef struct BdrSequencerControl
 {
-   size_t      slot;
+   int         next_slot;
    BdrSequencerSlot slots[FLEXIBLE_ARRAY_MEMBER];
 } BdrSequencerControl;
 
@@ -429,6 +429,12 @@ bdr_sequencer_shmem_startup(void)
    {
        /* initialize */
        memset(BdrSequencerCtl, 0, bdr_sequencer_shmem_size());
+       /*
+        * next_slot allows perdb workers to allocate seq slots.
+        * The sequencer will likely be separated into a different
+        * worker later.
+        */
+       BdrSequencerCtl->next_slot = 0;
    }
    LWLockRelease(AddinShmemInitLock);
 
@@ -450,6 +456,19 @@ bdr_sequencer_shmem_init(int nnodes, int sequencers)
    shmem_startup_hook = bdr_sequencer_shmem_startup;
 }
 
+/*
+ * Hand out the next sequencer slot number.
+ *
+ * Returns the current value of BdrSequencerCtl->next_slot and advances the
+ * counter by one. The perdb worker setup code uses this to decide which
+ * sequencer slot a new per-db worker gets.
+ *
+ * No locking or bounds checking is done here.
+ *
+ * This should go away once the sequencer is separated into its own
+ * worker.
+ */
+int
+bdr_sequencer_get_next_free_slot(void)
+{
+   int     allocated_slot = BdrSequencerCtl->next_slot;
+
+   BdrSequencerCtl->next_slot = allocated_slot + 1;
+
+   return allocated_slot;
+}
+
 void
 bdr_sequencer_wakeup(void)
 {
@@ -509,6 +528,13 @@ bdr_schedule_eoxact_sequencer_wakeup(void)
    bdr_seq_pending_wakeup = true;
 }
 
+/*
+ * Store a new node count in the sequencer slot selected by seq_slot.
+ */
+void
+bdr_sequencer_set_nnodes(Size nnodes)
+{
+   BdrSequencerCtl->slots[seq_slot].nnodes = nnodes;
+}
+
 void
 bdr_sequencer_init(int new_seq_slot, Size nnodes)
 {
diff --git a/bdr_supervisor.c b/bdr_supervisor.c
new file mode 100644 (file)
index 0000000..4353c45
--- /dev/null
@@ -0,0 +1,441 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr_supervisor.c
+ *     Cluster wide supervisor worker.
+ *
+ * Copyright (C) 2014-2015, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     bdr_supervisor.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+#include "bdr_label.h"
+
+#include "miscadmin.h"
+#include "pgstat.h"
+
+#include "access/relscan.h"
+#include "access/skey.h"
+#include "access/xact.h"
+
+#include "catalog/objectaddress.h"
+#include "catalog/pg_database.h"
+#include "catalog/pg_shseclabel.h"
+
+#include "commands/dbcommands.h"
+#include "commands/seclabel.h"
+
+#include "postmaster/bgworker.h"
+
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/ipc.h"
+
+#include "utils/builtins.h"
+#include "utils/elog.h"
+#include "utils/fmgroids.h"
+#include "utils/guc.h"
+
+/*
+ * Register a new perdb worker for the named database. The worker MUST
+ * not already exist.
+ *
+ * dbname is copied (truncated to NAMEDATALEN-1 bytes) into a freshly
+ * allocated BDR_WORKER_PERDB slot in the shared worker array, and a dynamic
+ * background worker running bdr_perdb_worker_main is registered for it.
+ *
+ * The caller must already hold BdrWorkerCtl->lock (asserted below).
+ *
+ * Raises ERROR if the background worker cannot be registered. The shmem
+ * slot is marked empty again before the error is raised so that repeated
+ * failures don't exhaust the slot array.
+ *
+ * Within this file it is called from bdr_supervisor_rescan_dbs().
+ */
+static void
+bdr_register_perdb_worker(const char * dbname)
+{
+   BackgroundWorkerHandle *bgw_handle;
+   BackgroundWorker        bgw;
+   BdrWorker              *worker;
+   BdrPerdbWorker         *perdb;
+   unsigned int            worker_slot_number;
+   uint32                  worker_arg;
+
+   Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+   elog(DEBUG2, "Registering per-db worker for db %s", dbname);
+
+   worker = bdr_worker_shmem_alloc(
+               BDR_WORKER_PERDB,
+               &worker_slot_number
+           );
+
+   perdb = &worker->data.perdb;
+
+   /* strncpy doesn't guarantee termination, so terminate explicitly */
+   strncpy(NameStr(perdb->dbname),
+           dbname, NAMEDATALEN);
+   NameStr(perdb->dbname)[NAMEDATALEN-1] = '\0';
+   /* Nodecount is set when apply workers are registered */
+   perdb->nnodes = 0;
+#ifdef BUILDING_BDR
+   perdb->seq_slot = bdr_sequencer_get_next_free_slot();
+#endif
+
+   /*
+    * The rest of the perdb worker's shmem segment - proclatch,
+    * database_oid and nnodes - gets set up by the worker during startup.
+    */
+
+   bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+       BGWORKER_BACKEND_DATABASE_CONNECTION;
+   bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   bgw.bgw_main = NULL;
+   strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+   strncpy(bgw.bgw_function_name, "bdr_perdb_worker_main", BGW_MAXLEN);
+   bgw.bgw_restart_time = 5;
+   bgw.bgw_notify_pid = 0;
+   snprintf(bgw.bgw_name, BGW_MAXLEN,
+            "bdr db: %s", dbname);
+
+   /*
+    * The main arg is composed of two uint16 parts - the worker
+    * generation number (see bdr_worker_shmem_startup) and the index into
+    * BdrWorkerCtl->slots in shared memory.
+    */
+   Assert(worker_slot_number <= UINT16_MAX);
+   worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)worker_slot_number;
+   bgw.bgw_main_arg = Int32GetDatum(worker_arg);
+
+   if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+   {
+       /*
+        * No worker will ever attach to the slot we just allocated, so
+        * release it again before bailing out; otherwise every failed
+        * attempt would permanently consume a slot. We still hold
+        * BdrWorkerCtl->lock here. The sequencer slot number taken above
+        * is not reclaimed.
+        */
+       NameStr(perdb->dbname)[0] = '\0';
+       worker->worker_type = BDR_WORKER_EMPTY_SLOT;
+
+       ereport(ERROR,
+               (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+                errmsg("Registering BDR worker failed, check prior log messages for details")));
+   }
+
+   elog(DEBUG2, "Registered per-db worker for %s successfully", dbname);
+}
+
+/*
+ * Check for BDR-enabled DBs and start per-db workers for any that currently
+ * lack them.
+ *
+ * Scans pg_shseclabel for labels attached to databases by the BDR label
+ * provider. For each labelled database that bdr_is_bdr_activated_db()
+ * accepts and that has no per-db worker slot yet, a new per-db worker is
+ * registered. The whole scan runs in its own transaction with
+ * BdrWorkerCtl->lock held exclusively.
+ *
+ * TODO DYNCONF: Handle removal of BDR from DBs
+ */
+static void
+bdr_supervisor_rescan_dbs()
+{
+   Relation    secrel;
+   ScanKeyData skey[2];
+   SysScanDesc scan;
+   HeapTuple   secTuple;
+   int         n_new_workers = 0, bdr_dbs = 0;
+
+   elog(DEBUG1, "Supervisor scanning for BDR-enabled databases");
+
+   pgstat_report_activity(STATE_RUNNING, "scanning backends");
+
+   StartTransactionCommand();
+
+   /*
+    * Scan pg_shseclabel looking for entries for pg_database with the bdr
+    * label provider. We'll find all labels for the BDR provider,
+    * irrespective of value.
+    *
+    * The only index present isn't much use for this scan and using it makes
+    * us set up more keys, so do a heap scan.
+    *
+    * The lock taken on pg_shseclabel must be strong enough to conflict with
+    * the lock taken by bdr.bdr_connection_add(...) to ensure that any
+    * transactions adding new labels have committed and cleaned up before we
+    * read it. Otherwise a race between the supervisor latch being set in a
+    * commit hook and the tuples actually becoming visible is possible.
+    */
+   secrel = heap_open(SharedSecLabelRelationId, RowShareLock);
+
+   ScanKeyInit(&skey[0],
+               Anum_pg_shseclabel_classoid,
+               BTEqualStrategyNumber, F_OIDEQ,
+               ObjectIdGetDatum(DatabaseRelationId));
+
+   ScanKeyInit(&skey[1],
+               Anum_pg_shseclabel_provider,
+               BTEqualStrategyNumber, F_TEXTEQ,
+               CStringGetTextDatum(BDR_SECLABEL_PROVIDER));
+
+   scan = systable_beginscan(secrel, InvalidOid, false, NULL, 2, &skey[0]);
+
+   /*
+    * We need to scan the shmem segment that tracks BDR workers and possibly
+    * modify it, so lock it.
+    *
+    * We have to take an exclusive lock in case we need to modify it,
+    * otherwise we'd be faced with a lock upgrade.
+    */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+   /*
+    * Now examine each label and if there's no worker for the labeled
+    * DB already, start one.
+    */
+   while (HeapTupleIsValid(secTuple = systable_getnext(scan)))
+   {
+       FormData_pg_shseclabel *sec;
+       char                   *label_dbname;
+
+       sec = (FormData_pg_shseclabel*) GETSTRUCT(secTuple);
+
+       /*
+        * The per-db workers are mapped by name not oid, and that's necessary
+        * because the bgworker API requires that databases be identified by
+        * name.
+        *
+        * Look up the name of the DB with this OID and compare it. It's a bit slow,
+        * but we aren't doing this much.
+        *
+        * FIXME: Currently if a database is renamed, you'll have to restart
+        * PostgreSQL before BDR notices.
+        */
+       label_dbname = get_database_name(sec->objoid);
+
+       /*
+        * get_database_name() returns NULL when no database with that oid
+        * can be found, e.g. if it is being dropped concurrently. There is
+        * nothing to start a worker for in that case, and we must not pass
+        * NULL on to elog() or pfree().
+        */
+       if (label_dbname == NULL)
+       {
+           elog(DEBUG1, "Ignoring BDR label on nonexistent database oid=%u",
+                sec->objoid);
+           continue;
+       }
+
+       if (!bdr_is_bdr_activated_db(sec->objoid))
+       {
+           pfree(label_dbname);
+           continue;
+       }
+
+       /* Oids are unsigned, so print with %u rather than %i */
+       elog(DEBUG1, "Found BDR-enabled database %s (oid=%u)",
+            label_dbname, sec->objoid);
+
+       bdr_dbs++;
+
+       /*
+        * Check if we have a per-db worker for this db oid already and if
+        * we don't, start one.
+        *
+        * This is O(n^2) for n BDR-enabled DBs; to be more scalable we could
+        * accumulate and sort the oids, then do a single scan of the shmem
+        * segment. But really, if you have that many DBs this cost is nothing.
+        */
+       if (find_perdb_worker_slot(sec->objoid, NULL) == -1)
+       {
+           /* No perdb worker exists for this DB, make one */
+           bdr_register_perdb_worker(label_dbname);
+           n_new_workers++;
+       } else {
+           elog(DEBUG2, "per-db worker for db %s already exists, not registering",
+                label_dbname);
+       }
+
+       pfree(label_dbname);
+   }
+
+   elog(DEBUG2, "Found %i BDR-labeled DBs; registered %i new per-db workers",
+        bdr_dbs, n_new_workers);
+
+   LWLockRelease(BdrWorkerCtl->lock);
+
+   systable_endscan(scan);
+   heap_close(secrel, RowShareLock);
+
+   CommitTransactionCommand();
+
+   elog(DEBUG2, "Finished scanning for BDR-enabled databases");
+
+   pgstat_report_activity(STATE_IDLE, NULL);
+}
+
+/*
+ * Create the database the supervisor remains connected
+ * to, a DB with no user connections permitted.
+ *
+ * If a database named "bdr" already exists nothing is done. Otherwise it
+ * is created from template1 with a connection limit of 1. Runs in its own
+ * transaction and raises ERROR if creation fails.
+ *
+ * This is a workaround for the inability to use pg_shseclabel
+ * without a DB connection; see comments in bdr_supervisor_main
+ */
+static void
+bdr_supervisor_createdb()
+{
+   Oid dboid;
+
+   StartTransactionCommand();
+
+   /* If the DB already exists, no need to create it */
+   dboid = get_database_oid("bdr", true);
+
+   if (dboid == InvalidOid)
+   {
+       CreatedbStmt stmt;
+       DefElem de_template;
+       DefElem de_connlimit;
+
+       /*
+        * These are stack-allocated parse nodes, so zero them first to
+        * avoid handing createdb() structs with garbage in the fields we
+        * don't set explicitly.
+        */
+       memset(&stmt, 0, sizeof(stmt));
+       memset(&de_template, 0, sizeof(de_template));
+       memset(&de_connlimit, 0, sizeof(de_connlimit));
+
+       /* "type" is the node tag of the struct itself, not of its arg */
+       de_template.type = T_DefElem;
+       de_template.defname = "template";
+       de_template.arg = (Node*) makeString("template1");
+
+       de_connlimit.type = T_DefElem;
+       de_connlimit.defname = "connectionlimit";
+       de_connlimit.arg = (Node*) makeInteger(1);
+
+       stmt.type = T_CreatedbStmt;
+       stmt.dbname = "bdr";
+       stmt.options = list_make2(&de_template, &de_connlimit);
+
+       dboid = createdb(&stmt);
+
+       if (dboid == InvalidOid)
+           elog(ERROR, "Failed to create 'bdr' DB");
+
+       /* TODO DYNCONF: Add a comment to the db, and/or a dummy table */
+
+       /* Oids are unsigned, so print with %u rather than %i */
+       elog(LOG, "Created database 'bdr' (oid=%u) during BDR startup", dboid);
+   }
+   else
+   {
+       elog(DEBUG3, "Database 'bdr' (oid=%u) already exists, not creating", dboid);
+   }
+
+   CommitTransactionCommand();
+
+   Assert(dboid != InvalidOid);
+}
+
+
+/*
+ * The BDR supervisor is a static bgworker that serves as the master/supervisor
+ * for all BDR workers. It exists so that BDR can be enabled and disabled
+ * dynamically for databases.
+ *
+ * It is responsible for identifying BDR-enabled databases at startup and
+ * launching their dynamic per-db workers, and for rescanning whenever its
+ * latch is set. It should do as little else as possible, as it'll run when
+ * BDR is in shared_preload_libraries whether or not it's otherwise actually
+ * in use.
+ *
+ * The supervisor does not work in any user database. On first start it
+ * connects to template1 to create the dummy 'bdr' database, then exits so
+ * that it is started again and can connect to 'bdr' (see below).
+ *
+ * main_arg is unused and must be 0.
+ */
+void
+bdr_supervisor_worker_main(Datum main_arg)
+{
+   Assert(DatumGetInt32(main_arg) == 0);
+   Assert(IsBackgroundWorker);
+
+   /* Install signal handlers before unblocking signals */
+   pqsignal(SIGHUP, bdr_sighup);
+   pqsignal(SIGTERM, bdr_sigterm);
+   BackgroundWorkerUnblockSignals();
+
+   /*
+    * Unfortunately we currently can't access shared catalogs like
+    * pg_shseclabel (where we store information about which database use bdr)
+    * without being connected to a database. Only shared & nailed catalogs
+    * can be accessed before being connected to a database - and
+    * pg_shseclabel is not one of those.
+    *
+    * Instead we have a database "bdr" that's supposed to be empty which we
+    * just use to read pg_shseclabel. Not pretty, but it works.
+    *
+    * Without copying significant parts of InitPostgres() we can't even read
+    * pg_database without connecting to a database.  As we can't connect to
+    * "no database", we must connect to one that always exists, like
+    * template1, then use it to create a dummy database to operate in.
+    *
+    * Once created we set a shmem flag and restart so we know we can connect
+    * to the newly created database.
+    */
+   if (!BdrWorkerCtl->is_supervisor_restart)
+   {
+       BackgroundWorkerInitializeConnection("template1", NULL);
+       bdr_supervisor_createdb();
+
+       /* Remember in shmem that the 'bdr' DB now exists for the next start */
+       BdrWorkerCtl->is_supervisor_restart = true;
+
+       elog(DEBUG1, "BDR supervisor restarting to connect to 'bdr' DB");
+       /* NOTE(review): presumably relies on the postmaster restarting us - confirm */
+       proc_exit(1);
+   }
+
+   BackgroundWorkerInitializeConnection("bdr", NULL);
+
+   /* Publish our latch in shmem so that others can wake the supervisor */
+   LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+   BdrWorkerCtl->supervisor_latch = &MyProc->procLatch;
+   LWLockRelease(BdrWorkerCtl->lock);
+
+   elog(DEBUG1, "BDR supervisor connected to DB 'bdr'");
+
+   SetConfigOption("application_name", "bdr supervisor", PGC_USERSET, PGC_S_SESSION);
+
+   /* mark as idle, before starting to loop */
+   pgstat_report_activity(STATE_IDLE, NULL);
+
+   /* Initial scan: launch per-db workers for all BDR-enabled databases */
+   bdr_supervisor_rescan_dbs();
+
+   while (!got_SIGTERM)
+   {
+       int rc;
+
+       /*
+        * Sleep until our latch is set, the timeout expires or the
+        * postmaster dies. A set latch triggers a rescan for newly
+        * BDR-enabled databases below; a plain timeout does nothing but
+        * loop around again.
+        */
+       rc = WaitLatch(&MyProc->procLatch,
+                      WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                      180000L);
+
+       ResetLatch(&MyProc->procLatch);
+
+       /* emergency bailout if postmaster has died */
+       if (rc & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+
+       /* Reload the configuration file if we were sent SIGHUP */
+       if (got_SIGHUP)
+       {
+           got_SIGHUP = false;
+           ProcessConfigFile(PGC_SIGHUP);
+       }
+
+       if (rc & WL_LATCH_SET)
+       {
+           /*
+            * We've been asked to launch new perdb workers if there are any
+            * changes to security labels.
+            */
+           bdr_supervisor_rescan_dbs();
+       }
+   }
+
+   proc_exit(0);
+}
+
+/*
+ * Register the BDR supervisor bgworker, which will start all the
+ * per-db workers.
+ *
+ * Called in postmaster context from _PG_init.
+ *
+ * Shared memory isn't allocated yet when this runs, so no shmem slot
+ * information can be handed to the worker: its main argument is a fixed 0,
+ * which bdr_supervisor_worker_main asserts.
+ */
+void
+bdr_supervisor_register()
+{
+   BackgroundWorker supervisor_bgw;
+
+   Assert(IsPostmasterEnvironment && !IsUnderPostmaster);
+
+   /* Identify the worker and its entry point in our library */
+   snprintf(supervisor_bgw.bgw_name, BGW_MAXLEN,
+            "bdr supervisor");
+   strncpy(supervisor_bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+   strncpy(supervisor_bgw.bgw_function_name, "bdr_supervisor_worker_main", BGW_MAXLEN);
+   supervisor_bgw.bgw_main = NULL;
+   supervisor_bgw.bgw_main_arg = Int32GetDatum(0); /* unused */
+
+   /*
+    * The supervisor reads shared catalogs through a database connection
+    * (see bdr_supervisor_worker_main), so it has to be flagged as using
+    * one in the bgworker API, in addition to shmem access.
+    */
+   supervisor_bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+       BGWORKER_BACKEND_DATABASE_CONNECTION;
+
+   /* Start once recovery is done; restart interval of 1 */
+   supervisor_bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   supervisor_bgw.bgw_restart_time = 1;
+   supervisor_bgw.bgw_notify_pid = 0;
+
+   RegisterBackgroundWorker(&supervisor_bgw);
+}
diff --git a/bdr_upgrade.c b/bdr_upgrade.c
new file mode 100644 (file)
index 0000000..cb39688
--- /dev/null
@@ -0,0 +1,639 @@
+/* -------------------------------------------------------------------------
+ *
+ * bdr_upgrade.c
+ *     Support for upgrading between BDR versions
+ *
+ * Copyright (C) 2012-2015, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *     bdr_upgrade.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+#include "libpq-fe.h"
+#include "miscadmin.h"
+
+#include "libpq/pqformat.h"
+
+#include "catalog/pg_type.h"
+
+#include "storage/ipc.h"
+
+PGDLLEXPORT Datum bdr_upgrade_to_090(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(bdr_upgrade_to_090);
+
+/*
+ * Insert a bdr.bdr_connections row for the local node over the given libpq
+ * connection.
+ *
+ * conn            connection to run the INSERT on (the local node when
+ *                 upgrading the first node, otherwise the remote peer)
+ * local_sysid,
+ * local_timeline,
+ * local_dboid     the local node's identity, already rendered as strings
+ * my_conninfo     stored as conn_dsn for the new row
+ *
+ * The conn_origin_* columns are written as 0 and conn_is_unidirectional as
+ * false. Raises ERROR if the INSERT does not complete with PGRES_COMMAND_OK.
+ */
+static void
+bdr_upgrade_to_090_insert_connection(PGconn *conn,
+       const char *local_sysid, const char *local_timeline,
+       const char *local_dboid, const char *my_conninfo)
+{
+   PGresult        *res;
+   const char      *values[8];
+   Oid             types[8] =
+       { TEXTOID, OIDOID, OIDOID, TEXTOID, OIDOID, OIDOID, BOOLOID, TEXTOID };
+
+   values[0] = local_sysid;
+   values[1] = local_timeline;
+   values[2] = local_dboid;
+   values[3] = "0";
+   values[4] = "0";
+   values[5] = "0";
+   values[6] = "f";
+   values[7] = my_conninfo;
+   /* TODO: replication sets too! */
+
+   res = PQexecParams(conn, "INSERT INTO bdr.bdr_connections\n"
+                            "(conn_sysid, conn_timeline, conn_dboid,\n"
+                            " conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,\n"
+                            " conn_is_unidirectional, conn_dsn)\n"
+                            "VALUES ($1,$2,$3,$4,$5,$6,$7,$8)",
+                      8, types, values, NULL, NULL, 0);
+
+   if (PQresultStatus(res) != PGRES_COMMAND_OK)
+   {
+       /*
+        * The PGresult is malloc()ed by libpq, so it is not released when
+        * elog(ERROR) unwinds. Copy what the message needs into palloc'd
+        * memory and free the result before raising the error.
+        */
+       ExecStatusType  status = PQresultStatus(res);
+       char           *errtext = pstrdup(PQresultErrorMessage(res));
+
+       PQclear(res);
+
+       elog(ERROR, "inserting local info into bdr_connections failed with %s: %s\n",
+            PQresStatus(status), errtext);
+   }
+
+   PQclear(res);
+}
+
+/*
+ * Utility function for upgrading a BDR node running 0.8.0 or older to 0.9.0
+ * (dynamic configuration).
+ *
+ * Arguments (SQL level, all passed as cstrings):
+ *
+ *   0: my_conninfo      dsn stored as this node's conn_dsn and used by the
+ *                       remote peer for its connect-back test. Must not be
+ *                       NULL.
+ *   1: local conninfo   optional dsn used for the loopback connection to the
+ *                       local node; when NULL my_conninfo is used instead.
+ *   2: remote_conninfo  dsn of an already upgraded peer, or NULL when the
+ *                       node being upgraded is the first node of the group.
+ *
+ * When remote_conninfo is NULL only the local steps are performed: the local
+ * bdr_nodes row is updated, the security label is set and a bdr_connections
+ * row for this node is inserted locally.
+ *
+ * Otherwise this does some sanity checks to ensure the local node isn't
+ * already joined and that the remote node is actually a known peer with a
+ * bdr_nodes entry.
+ *
+ * It then copies the remote end's bdr_connections entries to the local node so
+ * the local node knows which peers to connect to. It inserts a copy of the
+ * local node's bdr_connections entry in the remote and tells the local and
+ * remote nodes to refresh their worker lists.
+ *
+ * This is one long function because it's one-shot code. It's written in C
+ * so it can re-use libpq connections across multiple steps, doing all the
+ * work on each connection in a single transaction.
+ *
+ * NOTE(review): most error paths below raise ERROR while still holding a
+ * libpq PGresult, which is malloc()ed and therefore leaked. Only the
+ * success-path leaks are fixed here.
+ */
+Datum
+bdr_upgrade_to_090(PG_FUNCTION_ARGS)
+{
+   const char  *my_conninfo;
+   const char  *remote_conninfo;
+   const char  *my_local_conninfo = NULL;
+   PGconn      *local_conn = NULL;
+   const char  *local_dsn;
+
+   char        local_sysid_str[33];
+   char        local_timeline_str[33];
+   char        local_dboid_str[33];
+
+   /*
+    * This function checks PG_ARGISNULL() for its other arguments, so it is
+    * callable with NULLs. A NULL my_conninfo cannot be used for anything,
+    * so reject it up front instead of passing a NULL pointer to libpq.
+    */
+   if (PG_ARGISNULL(0))
+       ereport(ERROR,
+               (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+                errmsg("the connection string for the local node may not be null")));
+
+   my_conninfo = PG_GETARG_CSTRING(0);
+
+   stringify_my_node_identity(local_sysid_str, sizeof(local_sysid_str),
+                              local_timeline_str, sizeof(local_timeline_str),
+                              local_dboid_str, sizeof(local_dboid_str));
+
+   if (!PG_ARGISNULL(1))
+   {
+       my_local_conninfo = PG_GETARG_CSTRING(1);
+       local_dsn = my_local_conninfo;
+   }
+   else
+   {
+       local_dsn = my_conninfo;
+   }
+
+   if (PG_ARGISNULL(2))
+   {
+       elog(NOTICE, "upgrading the first node of a BDR group (remote_conninfo was null)");
+       remote_conninfo = NULL;
+   }
+   else
+   {
+       elog(NOTICE, "upgrading the local node by connecting to an already upgraded peer node");
+       remote_conninfo = PG_GETARG_CSTRING(2);
+   }
+
+   /*
+    * Connect to the local node in non-replication mode.
+    *
+    * We'll use this connection to COPY bdr_connections data, instead of
+    * having to mess around constructing and deconstructing bdr_connections
+    * tuples. It also lets us commit autonomously.
+    */
+   local_conn = PQconnectdb(local_dsn);
+
+   if (PQstatus(local_conn) != CONNECTION_OK)
+   {
+       /*
+        * The error cleanup callback isn't registered yet, so release the
+        * failed connection ourselves. Its error text lives inside the
+        * PGconn, so copy it first.
+        */
+       char   *conn_errmsg = pstrdup(PQerrorMessage(local_conn));
+
+       PQfinish(local_conn);
+       local_conn = NULL;
+
+       ereport(ERROR,
+               (errmsg("connection to supplied local dsn '%s' failed", local_dsn),
+                errdetail("Connection failed with %s", conn_errmsg)));
+   }
+
+   PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&local_conn));
+   {
+       PGconn *remote_conn = NULL;
+       PGresult *res;
+       remote_node_info    ri, li, li_via_remote;
+       Oid         nodeid_types[3] = { TEXTOID, OIDOID, OIDOID };
+       const char  *local_nodeid_values[3];
+
+       const char * const bdr_nodes_query =
+           "SELECT 1 FROM bdr.bdr_nodes "
+           "WHERE node_sysid = $1 AND node_timeline = $2 AND node_dboid = $3";
+
+       /* Run on both ends: opens the transaction and locks the catalogs */
+       const char * const setup_query =
+           "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;\n"
+           "SET search_path = bdr, pg_catalog;\n"
+           "SET bdr.permit_unsafe_ddl_commands = on;\n"
+           "SET bdr.skip_ddl_replication = on;\n"
+           "SET bdr.skip_ddl_locking = on;\n"
+           "LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;\n"
+           "LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;\n";
+
+       local_nodeid_values[0] = &local_sysid_str[0];
+       local_nodeid_values[1] = &local_timeline_str[0];
+       local_nodeid_values[2] = &local_dboid_str[0];
+
+       res = PQexec(local_conn, setup_query);
+       if (PQresultStatus(res) != PGRES_COMMAND_OK)
+           elog(ERROR, "BEGIN or table locking on local failed: %s",
+                   PQresultErrorMessage(res));
+
+       PQclear(res);
+
+       /*
+        * Check that the local connection supplied is usable, and that the
+        * node identity of the endpoint matches the node we're being called
+        * in.
+        *
+        * This tests the local-only dsn if one was supplied, otherwise
+        * my_conninfo; whichever one we're using for the upgrade. (There's
+        * no guarantee that my_conninfo is even valid from the perspective
+        * of the local node if a local dsn was also supplied).
+        *
+        * Replication mode isn't tested here. We'll ask the peer to
+        * connect back to us later instead.
+        */
+       bdr_get_remote_nodeinfo_internal(local_conn, &li);
+
+       if (!(li.sysid == GetSystemIdentifier()
+           && li.timeline == ThisTimeLineID
+           && li.dboid == MyDatabaseId))
+       {
+           ereport(ERROR,
+                   (errmsg("local dsn %s must point to the local node", local_dsn),
+                    errdetail("Expected node identity ("UINT64_FORMAT",%u,%u) but got ("UINT64_FORMAT",%u,%u)",
+                        GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId,
+                        li.sysid, li.timeline, li.dboid)));
+       }
+
+       if (!li.is_superuser)
+           elog(ERROR, "local connection '%s' must have superuser rights", local_dsn);
+
+       {
+           /*
+            * Check for ourselves in local bdr_nodes by UPDATEing our local
+            * bdr_nodes entry. This will get propagated to the remote end later.
+            *
+            * These values could already be set if a prior upgrade attempt failed
+            * after a local commit and before the remote commit.
+            */
+           const char *    node_status;
+           const char *    bdr_nodes_update_values[5];
+
+           Oid             bdr_nodes_update_types[5] =
+               { TEXTOID, OIDOID, OIDOID, TEXTOID, TEXTOID };
+
+
+           bdr_nodes_update_values[0] = &local_sysid_str[0];
+           bdr_nodes_update_values[1] = &local_timeline_str[0];
+           bdr_nodes_update_values[2] = &local_dboid_str[0];
+
+           if (local_dsn != NULL)
+               bdr_nodes_update_values[3] = local_dsn;
+           else
+               bdr_nodes_update_values[3] = NULL;
+
+           /* A NULL parameter value is sent as SQL NULL (first node) */
+           if (remote_conninfo != NULL)
+               bdr_nodes_update_values[4] = remote_conninfo;
+           else
+               bdr_nodes_update_values[4] = NULL;
+
+           /*
+            * Each fragment ends in a space: adjacent literals are simply
+            * concatenated, and "$3RETURNING" must not reach the server.
+            */
+           res = PQexecParams(local_conn,
+                              "UPDATE bdr.bdr_nodes "
+                              "SET node_local_dsn = $4, "
+                              "    node_init_from_dsn = $5 "
+                              "WHERE node_sysid = $1 "
+                              "  AND node_timeline = $2 "
+                              "  AND node_dboid = $3 "
+                              "RETURNING node_status",
+                              5, bdr_nodes_update_types, bdr_nodes_update_values,
+                              NULL, NULL, 0);
+
+           if (PQresultStatus(res) != PGRES_TUPLES_OK)
+           {
+               elog(ERROR, "updating local bdr_nodes failed: state %s: %s\n",
+                    PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+           }
+
+           if (PQntuples(res) != 1)
+           {
+               ereport(ERROR,
+                       (errmsg("no entry for local node found in bdr.bdr_nodes"),
+                        errdetail("Expected (node_sysid="UINT64_FORMAT",node_timeline=%u,node_dboid=%u) but no such row found in bdr_nodes",
+                            GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId)));
+           }
+
+           /* node_status points into res; don't clear res until we're done */
+           node_status = PQgetvalue(res, 0, 0);
+
+           if (strcmp(node_status, "r") != 0)
+           {
+               ereport(ERROR,
+                       (errmsg("bdr_nodes entry for local node has status != 'r'"),
+                        errdetail("Row with (node_sysid="UINT64_FORMAT",node_timeline=%u,node_dboid=%u) but status = '%s' not expected 'r'",
+                            GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId, node_status)));
+           }
+
+           PQclear(res);
+       }
+
+       /*
+        * Another sanity check: Local bdr_connections must be empty.
+        *
+        * If it isn't then a prior upgrade failed after the local commit
+        * but before the remote commit. The local bdr_connections must be
+        * deleted with replication disabled to prevent the deletion
+        * from being enqueued on the outbound slots. This is done
+        * manually by the user per the docs.
+        */
+       res = PQexec(local_conn, "SELECT 1 FROM bdr.bdr_connections");
+
+       if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       {
+           elog(ERROR, "querying local bdr_connections failed: state %s: %s\n",
+                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+       }
+
+       if (PQntuples(res) > 0)
+       {
+           ereport(ERROR,
+                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                    errmsg("the local node's bdr.bdr_connections is not empty"),
+                    errdetail("No connections from the local node to other nodes may exist when upgrading"),
+                    errhint("If a prior upgrade attempt failed see the documentation for recovery steps")));
+       }
+
+       PQclear(res);
+
+       /*
+        * BDR requires a security label to be set on the database in order
+        * to start up.
+        */
+       res = PQexec(local_conn, "SELECT bdr.internal_update_seclabel()");
+
+       if (PQresultStatus(res) != PGRES_TUPLES_OK)
+       {
+           elog(ERROR, "setting local bdr security label failed: state %s: %s\n",
+                PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+       }
+
+       PQclear(res);
+
+
+       /*
+        * If this is the first node, insert an entry for ourselves into
+        * the local bdr_connections. We can't insert into the remote and
+        * have it replicate because there is no remote.
+        */
+       if (remote_conninfo == NULL)
+       {
+           bdr_upgrade_to_090_insert_connection(local_conn, local_sysid_str,
+                   local_timeline_str, local_dboid_str, my_conninfo);
+       }
+
+       /*
+        * Establish the connection we'll use to copy the bdr_connections
+        * entries we need and insert our own bdr_connections entry
+        * into the remote end.
+        */
+       if (remote_conninfo != NULL)
+       {
+           StringInfoData  dsn;
+
+           initStringInfo(&dsn);
+           appendStringInfo(&dsn,
+                           "%s fallback_application_name='"BDR_LOCALID_FORMAT":init'",
+                           remote_conninfo, BDR_LOCALID_FORMAT_ARGS);
+
+           /* Plain (non-replication) connection to the peer node */
+           remote_conn = PQconnectdb(dsn.data);
+           if (PQstatus(remote_conn) != CONNECTION_OK)
+           {
+               /*
+                * Report with ERROR, not FATAL: this runs in a user's
+                * backend and a bad dsn shouldn't terminate the session.
+                *
+                * The cleanup callback for remote_conn isn't registered
+                * yet, so close the failed connection here after copying
+                * its error text out of the PGconn.
+                */
+               char   *conn_errmsg = pstrdup(PQerrorMessage(remote_conn));
+
+               PQfinish(remote_conn);
+               remote_conn = NULL;
+
+               ereport(ERROR,
+                       (errmsg("could not connect to the server in non-replication mode: %s",
+                               conn_errmsg),
+                        errdetail("dsn was: %s", dsn.data)));
+           }
+       }
+
+       PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                               PointerGetDatum(&remote_conn));
+       {
+
+           char        remote_sysid_str[33];
+           char        remote_timeline_str[33];
+           char        remote_dboid_str[33];
+           const char *remote_nodeid_values[3];
+
+           if (remote_conn != NULL)
+           {
+               res = PQexec(remote_conn, setup_query);
+               if (PQresultStatus(res) != PGRES_COMMAND_OK)
+                   elog(ERROR, "BEGIN or table locking on remote failed: %s",
+                           PQresultErrorMessage(res));
+
+               PQclear(res);
+
+               /*
+                * Obtain the remote node's identity so we can look it up in the local
+                * bdr_nodes and see if we recognise this node. This will also ensure
+                * BDR is installed on the remote.
+                */
+               bdr_get_remote_nodeinfo_internal(remote_conn, &ri);
+
+               if (ri.sysid == GetSystemIdentifier()
+                   && ri.timeline == ThisTimeLineID
+                   && ri.dboid == MyDatabaseId)
+               {
+                   bdr_error_nodeids_must_differ(ri.sysid, ri.timeline, ri.dboid);
+               }
+
+               if (ri.version_num != BDR_VERSION_NUM)
+                   elog(ERROR, "remote end must run BDR version %s but is running %s",
+                        BDR_VERSION, ri.version);
+
+               if (!ri.is_superuser)
+                   elog(ERROR, "connection must have superuser rights");
+
+               if (strcmp(ri.variant, "BDR") != 0)
+                   elog(ERROR, "remote node must be running full BDR, not variant %s",
+                           ri.variant);
+
+               /*
+                * As a further sanity check, make sure the remote node can connect back
+                * to the local node, and that the resulting IDs match.
+                */
+               bdr_test_remote_connectback_internal(remote_conn, &li_via_remote, my_conninfo);
+
+               if (!(li_via_remote.sysid == GetSystemIdentifier()
+                   && li_via_remote.timeline == ThisTimeLineID
+                   && li_via_remote.dboid == MyDatabaseId))
+               {
+                   ereport(ERROR,
+                           (errmsg("remote node can connect to dsn %s but it doesn't match the local node identity", my_conninfo),
+                            errdetail("Expected node identity ("UINT64_FORMAT",%u,%u) but got ("UINT64_FORMAT",%u,%u)",
+                                GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId,
+                                li_via_remote.sysid, li_via_remote.timeline, li_via_remote.dboid)));
+               }
+
+               if (!li_via_remote.is_superuser)
+                   elog(ERROR, "connection from remote node to local node using dsn '%s' must have superuser rights", my_conninfo);
+
+               /*
+                * The basics look sane. Check to see if the target node is present
+                * in the local bdr_nodes. If it isn't then we can't join it with
+                * an upgrade, because it's not an existing peer.
+                */
+
+               stringify_node_identity(remote_sysid_str, sizeof(remote_sysid_str),
+                                       remote_timeline_str, sizeof(remote_timeline_str),
+                                       remote_dboid_str, sizeof(remote_dboid_str),
+                                       ri.sysid, ri.timeline, ri.dboid);
+
+               remote_nodeid_values[0] = &remote_sysid_str[0];
+               remote_nodeid_values[1] = &remote_timeline_str[0];
+               remote_nodeid_values[2] = &remote_dboid_str[0];
+
+               res = PQexecParams(local_conn, bdr_nodes_query, 3, nodeid_types, remote_nodeid_values, NULL, NULL, 0);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying local bdr_nodes for remote nodeid failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) == 0)
+               {
+                   /* Looks like we didn't find the expected node entry */
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The remote node identified by the passed remote connection string is not known locally"),
+                            errdetail("The remote node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the correponding (node_sysid,node_timeline,node_dboid) is present in the local bdr.bdr_nodes",
+                                ri.sysid, ri.timeline, ri.dboid),
+                            errhint("You can only upgrade a node by connecting to a node it was already joined to before the BDR version update")));
+               }
+
+               Assert(PQntuples(res) == 1);
+
+               PQclear(res);
+
+               /*
+                * Now ensure that our node is known to the remote end
+                */
+               res = PQexecParams(remote_conn, bdr_nodes_query, 3, nodeid_types,
+                                  local_nodeid_values, NULL, NULL, 0);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying remote bdr_nodes for local nodeid failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) == 0)
+               {
+                   /*
+                    * We're not known to the remote node so we can't do an upgrade
+                    * join to it.
+                    */
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The node identified by the passed connection string does not recognise the local node"),
+                            errdetail("The local node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the correponding (node_sysid,node_timeline,node_dboid) is present in the remote bdr.bdr_nodes",
+                                GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId),
+                            errhint("You can only upgrade a node by connecting to a node it was already joined to before the BDR version update")));
+               }
+
+               Assert(PQntuples(res) == 1);
+
+               PQclear(res);
+
+               /*
+                * We now know there's a bdr_nodes entry on each end. Ensure that the
+                * remote end contains at least a bdr_connections entry for its self
+                * and does NOT contain a connection for us.
+                */
+               res = PQexec(remote_conn,
+                            "SELECT 1 "
+                            "FROM bdr.bdr_connections c, "
+                            "     bdr.bdr_get_local_nodeid() l "
+                            "WHERE c.conn_sysid = l.sysid "
+                            "  AND c.conn_timeline = l.timeline "
+                            "  AND c.conn_dboid = l.dboid "
+                            );
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying remote bdr_connections failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) != 1)
+               {
+                   /* The query above ran on the remote, so say so */
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The node identified by the passed connection string does not yet have a connection entry for its own node"),
+                            errdetail("The remote node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the correponding (conn_sysid,conn_timeline,conn_dboid) is present in the remote bdr.bdr_connections",
+                                ri.sysid, ri.timeline, ri.dboid),
+                            errhint("You must have already upgraded the other node before you can use it to upgrade this node.")));
+               }
+
+               PQclear(res);
+
+               res = PQexecParams(remote_conn,
+                                  "SELECT 1 "
+                                  "FROM bdr.bdr_connections c "
+                                  "WHERE c.conn_sysid = $1 "
+                                  "  AND c.conn_timeline = $2 "
+                                  "  AND c.conn_dboid = $3 ",
+                                  3, nodeid_types, local_nodeid_values, NULL, NULL, 0);
+
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               {
+                   elog(ERROR, "Querying remote bdr_connections failed: state %s: %s\n",
+                        PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+               }
+
+               if (PQntuples(res) != 0)
+               {
+                   /* Report the local node's identity: all three fields from li */
+                   ereport(ERROR,
+                           (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                            errmsg("The node identified by the passed connection string already has a connection string for the local node"),
+                            errdetail("The local node's identity (conn_sysid="UINT64_FORMAT",conn_timeline=%u,conn_dboid=%u) already has an entry in the remote bdr.bdr_connections",
+                                li.sysid, li.timeline, li.dboid),
+                            errhint("You must have already upgraded the other node before you can use it to upgrade this node.")));
+               }
+
+               PQclear(res);
+
+               /*
+                * Alright, time to actually perform the upgrade.
+                *
+                * The remaining steps are:
+                *
+                * - Copy remote bdr_connections entries to the local node
+                *
+                * - Insert a row for the local node in the remote's
+                *   bdr_connections
+                *
+                * - Call bdr.bdr_connections_changed() on the remote, and
+                *   enqueue the same call in the remote's
+                *   bdr.bdr_queued_commands
+                *
+                * - Call bdr.bdr_connections_changed() on the local side
+                *
+                * - Commit the local transaction, saving the copied
+                *   bdr_connections entries
+                *
+                * - Commit the remote transaction, adding the bdr_connections
+                *   row
+                */
+
+               bdr_copytable(remote_conn, local_conn,
+                       "COPY (SELECT * FROM bdr.bdr_connections) TO stdout",
+                       "COPY bdr.bdr_connections FROM stdin");
+
+               /*
+                * Time to insert connection info about us into the remote node and ask it
+                * to connect back to us, then tell the other nodes. We don't update
+                * the remote's bdr_nodes entry for us, as the change we applied locally
+                * will get replicated.
+                *
+                * Since we have a remote conn we didn't insert our
+                * bdr_connections entry locally above. Insert it into the
+                * remote node now instead. It'll replicate back to the local
+                * node when we connect to the upstream.
+                */
+               bdr_upgrade_to_090_insert_connection(remote_conn, local_sysid_str,
+                       local_timeline_str, local_dboid_str, my_conninfo);
+
+               res = PQexec(remote_conn, "SELECT bdr.bdr_connections_changed()");
+               if (PQresultStatus(res) != PGRES_TUPLES_OK)
+                   elog(ERROR, "SELECT bdr.bdr_connections_changed() on remote failed: %s",
+                           PQresultErrorMessage(res));
+
+               PQclear(res);
+
+               res = PQexec(remote_conn, "INSERT INTO bdr.bdr_queued_commands\n"
+                                         "(lsn, queued_at, perpetrator, command_tag, command)\n"
+                                         "VALUES (pg_current_xlog_insert_location(), current_timestamp,\n"
+                                         "        current_user, 'SELECT',\n"
+                                         "       'SELECT bdr.bdr_connections_changed()');");
+
+               if (PQresultStatus(res) != PGRES_COMMAND_OK)
+                   elog(ERROR, "enqueuing bdr.bdr_connections_changed() in the ddl rep queue failed: %s",
+                           PQresultErrorMessage(res));
+
+               /* res is reassigned right after this block; free it first */
+               PQclear(res);
+           }
+
+           res = PQexec(local_conn, "SELECT bdr.bdr_connections_changed()");
+           if (PQresultStatus(res) != PGRES_TUPLES_OK)
+               elog(ERROR, "SELECT bdr.bdr_connections_changed() on local failed: %s",
+                       PQresultErrorMessage(res));
+
+           PQclear(res);
+
+           /* The local side commits first, then the remote (if any) */
+           res = PQexec(local_conn, "COMMIT");
+           if (PQresultStatus(res) != PGRES_COMMAND_OK)
+               elog(ERROR, "COMMIT on local failed: %s",
+                       PQresultErrorMessage(res));
+
+           PQclear(res);
+
+           if (remote_conn != NULL)
+           {
+               res = PQexec(remote_conn, "COMMIT");
+               if (PQresultStatus(res) != PGRES_COMMAND_OK)
+                   elog(ERROR, "COMMIT on remote failed: %s",
+                           PQresultErrorMessage(res));
+
+               PQclear(res);
+
+               free_remote_node_info(&ri);
+           }
+
+           free_remote_node_info(&li);
+       }
+       PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                               PointerGetDatum(&remote_conn));
+
+       /* PQfinish() accepts NULL, which is what we have for the first node */
+       PQfinish(remote_conn);
+
+   }
+   PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+                           PointerGetDatum(&local_conn));
+
+   PQfinish(local_conn);
+
+   PG_RETURN_VOID();
+}
index b574fbd9ada520fc51e39b4baeaf14e6543089f5..6aa5d912c6ce3891d72be63bbff0477361701bc0 100644 (file)
@@ -774,6 +774,13 @@ ERROR:  Tables WITH OIDs are not supported with bdr
 CREATE TABLE tbl_without_oids() WITHOUT oids;
 DROP TABLE tbl_without_oids;
 SET default_with_oids = false;
+SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), pid) FROM pg_stat_replication;
+ pg_xlog_wait_remote_apply 
+---------------------------
+(2 rows)
+
 --- AGGREGATE ---
 \c postgres
 CREATE AGGREGATE test_avg (
index 4dd05289da9cfbca97ca7c4cd8d194b73bef096e..0e0cd804c62e3b8249c0ccd6e6d55d095a7d975a 100644 (file)
@@ -9,45 +9,9 @@ CREATE USER super SUPERUSER;
 GRANT ALL ON SCHEMA public TO nonsuper;
 \c regression
 GRANT ALL ON SCHEMA public TO nonsuper;
-SELECT pg_sleep(10);
- pg_sleep 
-----------
-(1 row)
-
--- emulate the pg_xlog_wait_remote_apply on vanilla postgres
-DO $DO$BEGIN
-   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
-   IF FOUND THEN
-       RETURN;
-   END IF;
-
-   PERFORM bdr.bdr_replicate_ddl_command($DDL$
-       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
-       AS $FUNC$
-       BEGIN
-           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
-               PERFORM pg_sleep(0.01);
-           END LOOP;
-       END;$FUNC$ LANGUAGE plpgsql;
-   $DDL$);
-END;$DO$;
-SELECT bdr.bdr_replicate_ddl_command($DDL$
-CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
-    OUT readdb1 text,
-    OUT readdb2 text,
-    OUT writedb1 text,
-    OUT writedb2 text
-    ) RETURNS record LANGUAGE SQL AS $f$
-SELECT
-    current_setting('bdrtest.readdb1'),
-    current_setting('bdrtest.readdb2'),
-    current_setting('bdrtest.writedb1'),
-    current_setting('bdrtest.writedb2')
-$f$;
-$DDL$);
- bdr_replicate_ddl_command 
----------------------------
-(1 row)
-
+\c postgres
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
+\c regression
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
diff --git a/expected/init_bdr.out b/expected/init_bdr.out
new file mode 100644 (file)
index 0000000..9b1dbe4
--- /dev/null
@@ -0,0 +1,97 @@
+\c postgres
+SELECT bdr.bdr_group_create(
+   dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-1']
+   );
+ bdr_group_create 
+------------------
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready 
+------------------------------
+(1 row)
+
+\c regression
+SELECT bdr.bdr_group_join(
+   dsn := 'dbname=regression',
+   init_from_dsn := 'dbname=postgres',
+   local_dsn := 'dbname=regression',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+ bdr_group_join 
+----------------
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready 
+------------------------------
+(1 row)
+
+-- Make sure we see two slots and two active connections
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+ plugin | slot_type |  database  | active 
+--------+-----------+------------+--------
+ bdr    | logical   | postgres   | t
+ bdr    | logical   | regression | t
+(2 rows)
+
+SELECT count(*) FROM pg_stat_replication;
+ count 
+-------
+     2
+(1 row)
+
+\c postgres
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+     conn_dsn      |                                conn_replication_sets                                 
+-------------------+--------------------------------------------------------------------------------------
+ dbname=postgres   | {default,important,for-node-1}
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(2 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+ node_status |  node_local_dsn   | node_init_from_dsn 
+-------------+-------------------+--------------------
+ r           | dbname=postgres   | 
+ r           | dbname=regression | dbname=postgres
+(2 rows)
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+     conn_dsn      |                                conn_replication_sets                                 
+-------------------+--------------------------------------------------------------------------------------
+ dbname=postgres   | {default,important,for-node-1}
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(2 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+ node_status |  node_local_dsn   | node_init_from_dsn 
+-------------+-------------------+--------------------
+ r           | dbname=postgres   | 
+ r           | dbname=regression | dbname=postgres
+(2 rows)
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
+ bdr_replicate_ddl_command 
+---------------------------
+(1 row)
+
diff --git a/expected/init_udr.out b/expected/init_udr.out
new file mode 100644 (file)
index 0000000..6740cb2
--- /dev/null
@@ -0,0 +1,89 @@
+\c postgres
+SELECT bdr.bdr_subscribe(
+   remote_dsn := 'dbname=regression',
+   local_dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+ bdr_subscribe 
+---------------
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready 
+------------------------------
+(1 row)
+
+-- Make sure we see the slot and active connection
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+ plugin | slot_type |  database  | active 
+--------+-----------+------------+--------
+ bdr    | logical   | regression | t
+(1 row)
+
+SELECT count(*) FROM pg_stat_replication;
+ count 
+-------
+     1
+(1 row)
+
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+     conn_dsn      |                                conn_replication_sets                                 
+-------------------+--------------------------------------------------------------------------------------
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(1 row)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+ node_status | node_local_dsn  | node_init_from_dsn 
+-------------+-----------------+--------------------
+ r           | dbname=postgres | dbname=regression
+(1 row)
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+ conn_dsn | conn_replication_sets 
+----------+-----------------------
+(0 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+ node_status | node_local_dsn | node_init_from_dsn 
+-------------+----------------+--------------------
+(0 rows)
+
+-- emulate the pg_xlog_wait_remote_apply on vanilla postgres
+DO $DO$BEGIN
+   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
+   IF FOUND THEN
+       RETURN;
+   END IF;
+
+   PERFORM bdr.bdr_replicate_ddl_command($DDL$
+       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
+       AS $FUNC$
+       BEGIN
+           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
+               PERFORM pg_sleep(0.01);
+           END LOOP;
+       END;$FUNC$ LANGUAGE plpgsql;
+   $DDL$);
+END;$DO$;
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
+ bdr_replicate_ddl_command 
+---------------------------
+(1 row)
+
diff --git a/expected/isolation/init.out b/expected/isolation/init.out
new file mode 100644 (file)
index 0000000..aa62f39
--- /dev/null
@@ -0,0 +1,137 @@
+Parsed test spec with 3 sessions
+
+starting permutation: setup1 setup2 setup3 join_root join_2 wait_join_2 check_join_2 join_3 wait_join_3 check_join_3 wait
+step setup1: 
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+
+step setup2: 
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+
+step setup3: 
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+
+step join_root: 
+   SELECT bdr.bdr_group_create(
+       dsn := 'dbname=node1'
+       );
+
+bdr_group_create
+
+               
+step join_2: 
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node2',
+       init_from_dsn := 'dbname=node1'
+       );
+
+bdr_group_join 
+
+               
+step wait_join_2: 
+   SELECT bdr.bdr_node_join_wait_for_ready();
+
+bdr_node_join_wait_for_ready
+
+               
+step check_join_2: 
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+pg_stat_clear_snapshot
+
+               
+plugin         slot_type      database       active         
+
+bdr            logical        node1          t              
+bdr            logical        node2          t              
+count          
+
+2              
+conn_dsn       conn_replication_sets
+
+dbname=node1   {default}      
+dbname=node2   {default}      
+node_status    node_local_dsn node_init_from_dsn
+
+r              dbname=node1                  
+r              dbname=node2   dbname=node1   
+step join_3: 
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node3',
+       init_from_dsn := 'dbname=node1',
+       local_dsn := 'dbname=node3'
+       );
+
+bdr_group_join 
+
+               
+step wait_join_3: 
+   SELECT bdr.bdr_node_join_wait_for_ready();
+
+bdr_node_join_wait_for_ready
+
+               
+step check_join_3: 
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+pg_stat_clear_snapshot
+
+               
+plugin         slot_type      database       active         
+
+bdr            logical        node1          t              
+bdr            logical        node1          t              
+bdr            logical        node2          t              
+bdr            logical        node2          t              
+bdr            logical        node3          t              
+bdr            logical        node3          t              
+count          
+
+6              
+conn_dsn       conn_replication_sets
+
+dbname=node1   {default}      
+dbname=node2   {default}      
+dbname=node3   {default}      
+node_status    node_local_dsn node_init_from_dsn
+
+r              dbname=node1                  
+r              dbname=node2   dbname=node1   
+r              dbname=node3   dbname=node1   
+step wait: 
+   -- pg_xlog_wait_remote_apply isn't good enough alone
+   -- as it doesn't permit us to say how many nodes must be present.
+   -- It'll succeed if there are zero nodes. So we first have to wait
+   -- for enough replication connections.
+   DO $$
+   DECLARE
+       nodecount integer := 0;
+       target_lsn pg_lsn;
+   BEGIN
+       WHILE nodecount <> 6
+       LOOP
+           PERFORM pg_sleep(1);
+           PERFORM pg_stat_clear_snapshot();
+           -- Now find out how many walsenders are running
+           nodecount := (SELECT count(*)
+                         FROM pg_catalog.pg_stat_replication);
+           RAISE NOTICE 'Found % nodes',nodecount;
+       END LOOP;
+       -- OK, all nodes seen, now we wait for catchup on them all.
+       target_lsn := pg_current_xlog_location();
+       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
+       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
+       RAISE NOTICE 'Catchup to LSN completed';
+   END;
+   $$;
+
diff --git a/expected/isolation/waitforstart.out b/expected/isolation/waitforstart.out
deleted file mode 100644 (file)
index 8fcfe02..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-Parsed test spec with 1 sessions
-
-starting permutation: wait
-step wait: 
-   DO $$
-   DECLARE
-       nodecount integer := 0;
-       target_lsn pg_lsn;
-   BEGIN
-       WHILE nodecount <> 6
-       LOOP
-           PERFORM pg_sleep(1);
-           PERFORM pg_stat_clear_snapshot();
-           -- Now find out how many walsenders are running
-           nodecount := (SELECT count(*)
-                         FROM pg_catalog.pg_stat_replication);
-           RAISE NOTICE 'Found % nodes',nodecount;
-       END LOOP;
-       -- OK, all nodes seen, now we wait for catchup on them all.
-       target_lsn := pg_current_xlog_location();
-       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
-       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
-       RAISE NOTICE 'Catchup to LSN completed';
-   END;
-   $$;
-
index 1478090345dc7910010daf37da5723ba67c46ee4..7f1fb1a35fab4d3d33a606ee5dc0f4ac96950da6 100644 (file)
@@ -25,6 +25,8 @@ CREATE EXTENSION bdr VERSION '0.8.0.7';
 DROP EXTENSION bdr;
 CREATE EXTENSION bdr VERSION '0.9.0.0';
 DROP EXTENSION bdr;
+CREATE EXTENSION bdr VERSION '0.9.0.1';
+DROP EXTENSION bdr;
 -- evolve version one by one from the oldest to the newest one
 CREATE EXTENSION bdr VERSION '0.8.0';
 ALTER EXTENSION bdr UPDATE TO '0.8.0.1';
@@ -35,8 +37,9 @@ ALTER EXTENSION bdr UPDATE TO '0.8.0.5';
 ALTER EXTENSION bdr UPDATE TO '0.8.0.6';
 ALTER EXTENSION bdr UPDATE TO '0.8.0.7';
 ALTER EXTENSION bdr UPDATE TO '0.9.0.0';
+ALTER EXTENSION bdr UPDATE TO '0.9.0.1';
 -- Should never have to do anything: You missed adding the new version above.
 ALTER EXTENSION bdr UPDATE;
-NOTICE:  version "0.9.0.0" of extension "bdr" is already installed
+NOTICE:  version "0.9.0.1" of extension "bdr" is already installed
 \c postgres
 DROP DATABASE extension_upgrade;
index 0560a8ed16ccef588486bfb99903f3dc176ff9f1..6880da348bcc818d463bc3e015a62a66d2e02606 100644 (file)
@@ -359,6 +359,9 @@ COMMENT ON COLUMN bdr_nodes.node_timeline IS 'timeline ID of this node';
 COMMENT ON COLUMN bdr_nodes.node_dboid IS 'local database oid on the cluster (node_sysid, node_timeline)';
 COMMENT ON COLUMN bdr_nodes.node_status IS 'Readiness of the node: [i]nitializing, [c]atchup, [r]eady. Doesn''t indicate connected/disconnected.';
 
+-- We don't exclude bdr_nodes with pg_extension_config_dump
+-- because this is a global table that's sync'd between nodes.
+
 CREATE TABLE bdr_global_locks(
     locktype text NOT NULL,
 
diff --git a/extsql/bdr--0.9.0.0--0.9.0.1.sql b/extsql/bdr--0.9.0.0--0.9.0.1.sql
new file mode 100644 (file)
index 0000000..98d9ef5
--- /dev/null
@@ -0,0 +1,584 @@
+-- Data structures for BDR's dynamic configuration management
+
+-- Session settings for the duration of this script. All three are
+-- RESET again at the bottom of the file.
+SET LOCAL search_path = bdr;
+SET bdr.permit_unsafe_ddl_commands = true;
+SET bdr.skip_ddl_replication = true;
+
+-- bdr.internal_begin_join() stores its local_dsn and remote_dsn arguments
+-- in these two columns when it creates the local node's row.
+ALTER TABLE bdr.bdr_nodes
+  ADD COLUMN node_local_dsn text,
+  ADD COLUMN node_init_from_dsn text;
+
+-- Re-create the node_status CHECK constraint with the set of status codes
+-- used by this version. 'b' is the status bdr.internal_begin_join() gives a
+-- freshly created local node row.
+-- NOTE(review): the meaning of 'o' is not visible in this script; confirm
+-- against the C sources before documenting it in bdr_nodes.node_status.
+ALTER TABLE bdr.bdr_nodes
+  DROP CONSTRAINT bdr_nodes_node_status_check;
+
+ALTER TABLE bdr.bdr_nodes
+  ADD CONSTRAINT bdr_nodes_node_status_check
+    CHECK (node_status in ('b', 'i', 'c', 'o', 'r'));
+
+-- One row per connection BDR should make: the conn_* identity columns name
+-- the node the dsn points at, the conn_origin_* columns optionally restrict
+-- the row to a single local node.
+CREATE TABLE bdr_connections (
+    conn_sysid text not null,
+    conn_timeline oid not null,
+    conn_dboid oid not null,  -- This is an oid local to the node_sysid cluster
+
+    -- Wondering why there's no FOREIGN KEY to bdr.bdr_nodes?
+    -- bdr.bdr_nodes won't be populated when the bdr.bdr_connections
+    -- row gets created on the local node.
+
+    -- These fields may later be used by BDR to override connection
+    -- settings from one node to a particular other node. At the
+    -- moment their main use is for UDR connections, where we must
+    -- ensure that the connection is only made from one particular
+    -- node.
+    --
+    -- They are part of the primary key below and therefore can never be
+    -- NULL. "No origin" is stored as the sentinel ('0', 0, 0), which is
+    -- what the join functions in this script insert and what the CHECK
+    -- constraints below compare against.
+    conn_origin_sysid text,
+    conn_origin_timeline oid,
+    conn_origin_dboid oid,
+
+    PRIMARY KEY(conn_sysid, conn_timeline, conn_dboid,
+                conn_origin_sysid, conn_origin_timeline, conn_origin_dboid),
+
+    -- Either a whole origin ID (for an override or UDR entry) or no
+    -- origin ID may be provided. Despite the constraint's name this tests
+    -- for the zero sentinel, not for SQL NULL.
+    CONSTRAINT origin_all_or_none_null
+        CHECK ((conn_origin_sysid = '0') = (conn_origin_timeline = 0)
+           AND (conn_origin_sysid = '0') = (conn_origin_dboid = 0)),
+
+    -- Indicates that this connection is unidirectional; there won't be
+    -- a corresponding inbound connection from the peer node. Only permitted
+    -- where the conn_origin fields are set.
+    conn_is_unidirectional boolean not null default false,
+
+    CONSTRAINT unidirectional_conn_must_have_origin
+        CHECK ((NOT conn_is_unidirectional) OR (conn_origin_sysid <> '0')),
+
+    conn_dsn text not null,
+
+    -- NULL means "use the global default"; the join functions map an
+    -- apply_delay argument of -1 or NULL to NULL here.
+    conn_apply_delay integer
+        CHECK (conn_apply_delay >= 0),
+
+    conn_replication_sets text[]
+);
+
+-- Connection strings may contain credentials, so keep the table private.
+REVOKE ALL ON TABLE bdr_connections FROM public;
+
+COMMENT ON TABLE bdr_connections IS 'Connection information for nodes in the group. Don''t modify this directly, use the provided functions. One entry should exist per node in the group.';
+
+COMMENT ON COLUMN bdr_connections.conn_sysid IS 'System identifier for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_timeline IS 'System timeline ID for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_dboid IS 'System database OID for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_origin_sysid IS 'If set, ignore this entry unless the local sysid is this';
+COMMENT ON COLUMN bdr_connections.conn_origin_timeline IS 'If set, ignore this entry unless the local timeline is this';
+COMMENT ON COLUMN bdr_connections.conn_origin_dboid IS 'If set, ignore this entry unless the local dboid is this';
+COMMENT ON COLUMN bdr_connections.conn_dsn IS 'A libpq-style connection string specifying how to make a connection to this node from other nodes.';
+COMMENT ON COLUMN bdr_connections.conn_apply_delay IS 'If set, milliseconds to wait before applying each transaction from the remote node. Mainly for debugging. If null, the global default applies.';
+COMMENT ON COLUMN bdr_connections.conn_replication_sets IS 'Replication sets this connection should participate in, if non-default.';
+
+-- Register the table as extension configuration so pg_dump includes its
+-- rows (empty filter condition = all rows).
+SELECT pg_catalog.pg_extension_config_dump('bdr_connections', '');
+
+-- C-language hook called by the join/subscribe functions in this script
+-- after they modify bdr.bdr_connections. According to the comments at its
+-- call sites it only registers work for commit time (starting the per-db
+-- worker / apply workers); nothing happens until the transaction commits.
+CREATE FUNCTION bdr_connections_changed()
+RETURNS void LANGUAGE c AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION bdr_connections_changed() FROM public;
+
+COMMENT ON FUNCTION bdr_connections_changed() IS 'Internal BDR function, do not call directly.';
+
+
+--
+-- This is a helper for node_join, for internal use only. It's called
+-- on the remote end by the init code when joining an existing group,
+-- to do the remote-side setup.
+--
+-- Parameters:
+--   sysid, timeline, dboid  identity of the joining node
+--   dsn                     connection string other nodes use to reach it
+--   apply_delay             apply delay in ms; -1 is stored as NULL
+--   replication_sets        replication sets for the new connection
+--
+-- Raises feature_not_supported if this node isn't full BDR, and
+-- object_not_in_prerequisite_state if the joining node has no bdr_nodes
+-- row here or that row isn't in status 'i'.
+--
+CREATE FUNCTION bdr.internal_node_join(
+    sysid text, timeline oid, dboid oid,
+    dsn text,
+    apply_delay integer,
+    replication_sets text[]
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+AS
+$body$
+DECLARE
+    status "char";
+BEGIN
+    LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;
+    LOCK TABLE pg_catalog.pg_shseclabel IN EXCLUSIVE MODE;
+
+    IF bdr_variant() <> 'BDR' THEN
+        RAISE USING
+            MESSAGE = 'Full BDR required but this module is built for '||bdr_variant(),
+            DETAIL = 'The target node is running something other than full BDR so you cannot join a BDR node to it',
+            HINT = 'Install full BDR if possible or use the UDR functions.',
+            ERRCODE = 'feature_not_supported';
+    END IF;
+
+    -- Assert that we have a bdr_nodes entry with state = i on this node.
+    -- node_status must be named in the select list: with an empty list the
+    -- query still sets FOUND but leaves "status" NULL, which silently
+    -- disables the status check below.
+    SELECT node_status INTO status
+    FROM bdr.bdr_nodes
+    WHERE node_sysid = sysid
+      AND node_timeline = timeline
+      AND node_dboid = dboid;
+
+    IF NOT FOUND THEN
+        RAISE object_not_in_prerequisite_state
+              USING MESSAGE = format('bdr.bdr_nodes entry for (%s,%s,%s) not found',
+                                     sysid, timeline, dboid);
+    END IF;
+
+    IF status <> 'i' THEN
+        RAISE object_not_in_prerequisite_state
+              USING MESSAGE = format('bdr.bdr_nodes entry for (%s,%s,%s) has unexpected status %L (expected ''i'')',
+                                     sysid, timeline, dboid, status);
+    END IF;
+
+    -- Insert or Update the connection info on this node, which we must be
+    -- initing from.
+    -- No need to care about concurrency here as we hold EXCLUSIVE LOCK.
+    BEGIN
+        INSERT INTO bdr.bdr_connections
+        (conn_sysid, conn_timeline, conn_dboid,
+         conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+         conn_dsn,
+         conn_apply_delay, conn_replication_sets,
+         conn_is_unidirectional)
+        VALUES
+        (sysid, timeline, dboid,
+         '0', 0, 0,
+         dsn,
+         CASE WHEN apply_delay = -1 THEN NULL ELSE apply_delay END,
+         replication_sets, false);
+    EXCEPTION WHEN unique_violation THEN
+        UPDATE bdr.bdr_connections
+        SET conn_dsn = dsn,
+            conn_apply_delay = CASE WHEN apply_delay = -1 THEN NULL ELSE apply_delay END,
+            conn_replication_sets = replication_sets,
+            conn_is_unidirectional = false
+        WHERE conn_sysid = sysid
+          AND conn_timeline = timeline
+          AND conn_dboid = dboid
+          AND conn_origin_sysid = '0'
+          AND conn_origin_timeline = 0
+          AND conn_origin_dboid = 0;
+    END;
+
+    -- Schedule the apply worker launch for commit time
+    PERFORM bdr.bdr_connections_changed();
+
+    -- and ensure the apply worker is launched on other nodes
+    -- when this transaction replicates there, too.
+    INSERT INTO bdr.bdr_queued_commands
+    (lsn, queued_at, perpetrator, command_tag, command)
+    VALUES
+    (pg_current_xlog_insert_location(), current_timestamp, current_user,
+    'SELECT', 'SELECT bdr.bdr_connections_changed()');
+END;
+$body$;
+
+
+-- Internal helper: (re)write the 'bdr' security label on the current
+-- database so that its JSON document contains "bdr": true, preserving any
+-- other keys already present in the label.
+CREATE FUNCTION bdr.internal_update_seclabel()
+RETURNS void LANGUAGE plpgsql
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    v_label json;
+BEGIN
+    -- Update the 'bdr' parameter in the current label if there is one.
+    -- (Right now there's not much point to this, but later we may
+    -- have more information in there.)
+
+    -- first select existing label
+    SELECT label::json INTO v_label
+    FROM pg_catalog.pg_shseclabel
+    WHERE provider = 'bdr'
+      AND classoid = 'pg_database'::regclass
+      AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+    -- then rebuild it: keep every key other than 'bdr' unchanged and
+    -- append 'bdr' => true
+    SELECT json_object_agg(key, value) INTO v_label
+    FROM (
+        SELECT key, value
+        FROM json_each(v_label)
+        WHERE key <> 'bdr'
+      UNION ALL
+        SELECT 'bdr', to_json(true)
+    ) d;
+
+    -- and set the newly computed label
+    -- (It's safe to do this early, it won't take effect
+    -- until commit)
+    -- %I / %L quote the database name and label so the dynamic statement
+    -- stays well-formed for any database name.
+    EXECUTE format('SECURITY LABEL FOR bdr ON DATABASE %I IS %L',
+                   current_database(), v_label);
+END;
+$body$;
+
+-- Setup that's common to BDR and UDR joins
+--
+-- Parameters:
+--   caller      name of the public function calling us, used in the HINT
+--   local_dsn   dsn that must connect back to this very node as superuser
+--   remote_dsn  dsn of the peer node to interrogate, or NULL for none
+--
+-- Returns the peer's identity (remote_sysid, remote_timeline, remote_dboid);
+-- all three are NULL when remote_dsn is NULL.
+--
+-- Side effects: takes EXCLUSIVE locks on bdr_connections, bdr_nodes and
+-- pg_shseclabel, creates the local bdr_nodes row with status 'b' if it is
+-- missing, and updates the database's 'bdr' security label.
+--
+-- Raises object_not_in_prerequisite_state when this node is already in a
+-- group, a dsn reaches the wrong node or lacks superuser rights, or the
+-- local and remote BDR versions are incompatible.
+CREATE FUNCTION bdr.internal_begin_join(caller text, local_dsn text, remote_dsn text,
+    remote_sysid OUT text, remote_timeline OUT oid, remote_dboid OUT oid
+)
+RETURNS record LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    localid RECORD;
+    localid_from_dsn RECORD;
+    remote_nodeinfo RECORD;
+BEGIN
+    -- Only one tx can be adding connections
+    LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;
+    LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;
+    LOCK TABLE pg_catalog.pg_shseclabel IN EXCLUSIVE MODE;
+
+    SELECT sysid, timeline, dboid INTO localid
+    FROM bdr.bdr_get_local_nodeid();
+
+    -- If there's already an entry for ourselves in bdr.bdr_connections
+    -- then we know this node is part of an active BDR group and cannot
+    -- be joined to another group. Unidirectional connections are ignored.
+    PERFORM 1 FROM bdr_connections
+    WHERE conn_sysid = localid.sysid
+      AND conn_timeline = localid.timeline
+      AND conn_dboid = localid.dboid
+      AND (conn_origin_sysid = '0'
+           AND conn_origin_timeline = 0
+           AND conn_origin_dboid = 0)
+      AND conn_is_unidirectional = 'f';
+
+    IF FOUND THEN
+        RAISE USING
+            MESSAGE = 'This node is already a member of a BDR group',
+            HINT = 'Connect to the node you wish to add and run '||caller||' from it instead',
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    -- Validate that the local connection is usable and matches
+    -- the node identity of the node we're running on.
+    --
+    -- For BDR this will NOT check the 'dsn' if 'local_dsn'
+    -- gets supplied. We don't know if 'dsn' is even valid
+    -- for loopback connections and can't assume it is. That'll
+    -- get checked later by BDR specific code.
+    SELECT * INTO localid_from_dsn
+    FROM bdr_get_remote_nodeinfo(local_dsn);
+
+    IF localid_from_dsn.sysid <> localid.sysid
+        OR localid_from_dsn.timeline <> localid.timeline
+        OR localid_from_dsn.dboid <> localid.dboid
+    THEN
+        RAISE USING
+            MESSAGE = 'node identity for local dsn does not match current node',
+            DETAIL = format($$The dsn '%s' connects to a node with identity (%s,%s,%s) but the local node is (%s,%s,%s)$$,
+                local_dsn, localid_from_dsn.sysid, localid_from_dsn.timeline,
+                localid_from_dsn.dboid, localid.sysid, localid.timeline, localid.dboid),
+            HINT = 'The local_dsn (or, for bdr, dsn if local_dsn is null) parameter must refer to the node you''re running this function from',
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    IF NOT localid_from_dsn.is_superuser THEN
+        RAISE USING
+            MESSAGE = 'local dsn does not have superuser rights',
+            DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, local_dsn),
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    -- Now interrogate the remote node, if specified, and sanity
+    -- check its connection too. The discovered node identity is
+    -- returned if found.
+    --
+    -- This will error out if there are issues with the remote
+    -- node.
+    IF remote_dsn IS NOT NULL THEN
+        SELECT * INTO remote_nodeinfo
+        FROM bdr_get_remote_nodeinfo(remote_dsn);
+
+        remote_sysid := remote_nodeinfo.sysid;
+        remote_timeline := remote_nodeinfo.timeline;
+        remote_dboid := remote_nodeinfo.dboid;
+
+        IF NOT remote_nodeinfo.is_superuser THEN
+            RAISE USING
+                MESSAGE = 'connection to remote node does not have superuser rights',
+                DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, remote_dsn),
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+
+        -- The remote must be at least as new as the oldest version we accept...
+        IF remote_nodeinfo.version_num < bdr_min_remote_version_num() THEN
+            RAISE USING
+                MESSAGE = 'remote node''s BDR version is too old',
+                DETAIL = format($$The dsn '%s' connects successfully but the remote node version %s is less than the required version %s$$,
+                    remote_dsn, remote_nodeinfo.version_num, bdr_min_remote_version_num()),
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+
+        -- ...and we must be at least as new as the oldest version it accepts.
+        -- The last format argument is *our* version, i.e. bdr_version_num().
+        IF remote_nodeinfo.min_remote_version_num > bdr_version_num() THEN
+            RAISE USING
+                MESSAGE = 'remote node''s BDR version is too new or this node''s version is too old',
+                DETAIL = format($$The dsn '%s' connects successfully but the remote node version %s requires this node to run at least bdr %s, not the current %s$$,
+                    remote_dsn, remote_nodeinfo.version_num, remote_nodeinfo.min_remote_version_num,
+                    bdr_version_num()),
+                ERRCODE = 'object_not_in_prerequisite_state';
+
+        END IF;
+
+    END IF;
+
+    -- Create local node record if needed
+    PERFORM 1 FROM bdr_nodes
+    WHERE node_sysid = localid.sysid
+      AND node_timeline = localid.timeline
+      AND node_dboid = localid.dboid;
+
+    IF NOT FOUND THEN
+        INSERT INTO bdr_nodes (
+            node_sysid, node_timeline, node_dboid,
+            node_status, node_local_dsn, node_init_from_dsn
+        ) VALUES (
+            localid.sysid, localid.timeline, localid.dboid,
+            'b', local_dsn, remote_dsn
+        );
+    END IF;
+
+    PERFORM bdr.internal_update_seclabel();
+END;
+$body$;
+
+--
+-- The public interface for node join/addition, to be run to join a currently
+-- unconnected node with a blank database to a BDR group.
+--
+-- Parameters:
+--   dsn               how other nodes connect to this node (required)
+--   init_from_dsn     existing group member to join via; NULL creates the
+--                     first node of a new group (see bdr_group_create)
+--   local_dsn         how this node connects to itself; defaults to dsn
+--   apply_delay       stored as conn_apply_delay for this node's connection
+--   replication_sets  replication sets for this node's connection
+--
+-- Raises invalid_parameter_value for a NULL dsn, feature_not_supported when
+-- not running full BDR, and object_not_in_prerequisite_state when any of
+-- the connection sanity checks fail.
+--
+CREATE FUNCTION bdr.bdr_group_join(
+    dsn text,
+    init_from_dsn text,
+    local_dsn text DEFAULT NULL,
+    apply_delay integer DEFAULT NULL,
+    replication_sets text[] DEFAULT ARRAY['default']
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    localid record;
+    connectback_nodeinfo record;
+BEGIN
+    IF dsn IS NULL THEN
+        RAISE USING
+            MESSAGE = 'dsn may not be null',
+            ERRCODE = 'invalid_parameter_value';
+    END IF;
+
+    IF bdr_variant() <> 'BDR' THEN
+        RAISE USING
+            MESSAGE = 'Full BDR required but this module is built for '||bdr_variant(),
+            DETAIL = 'The local node is not running full BDR, which is required to use bdr_group_join',
+            HINT = 'Install full BDR if possible or use the UDR functions.',
+            ERRCODE = 'feature_not_supported';
+    END IF;
+
+    -- Common checks and local node setup. When no separate loopback dsn
+    -- was given, the public dsn doubles as the local one.
+    PERFORM bdr.internal_begin_join(
+        'bdr_group_join',
+        COALESCE(local_dsn, dsn),
+        init_from_dsn);
+
+    SELECT sysid, timeline, dboid INTO localid
+    FROM bdr.bdr_get_local_nodeid();
+
+    -- Request additional connection tests to determine that the remote is
+    -- reachable for replication and non-replication mode and that the remote
+    -- can connect back to us via 'dsn' on non-replication and replication
+    -- modes.
+    --
+    -- This cannot be checked for the first node since there's no peer
+    -- to ask for help.
+    IF init_from_dsn IS NOT NULL THEN
+
+        SELECT * INTO connectback_nodeinfo
+        FROM bdr.bdr_test_remote_connectback(init_from_dsn, dsn);
+
+        -- The connectback must actually match our local node identity
+        -- and must provide a superuser connection.
+        IF NOT connectback_nodeinfo.is_superuser THEN
+            RAISE USING
+                MESSAGE = 'dsn does not have superuser rights when connecting via remote node',
+                DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, dsn),
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+
+        IF connectback_nodeinfo.sysid <> localid.sysid
+           OR connectback_nodeinfo.timeline <> localid.timeline
+           OR connectback_nodeinfo.dboid <> localid.dboid
+        THEN
+            -- Report 'dsn': that is the string the remote node actually
+            -- connected back with (local_dsn may well be NULL here).
+            RAISE USING
+                MESSAGE = 'node identity for dsn does not match current node when connecting back via remote',
+                DETAIL = format($$The dsn '%s' connects to a node with identity (%s,%s,%s) but the local node is (%s,%s,%s)$$,
+                    dsn, connectback_nodeinfo.sysid, connectback_nodeinfo.timeline,
+                    connectback_nodeinfo.dboid, localid.sysid, localid.timeline, localid.dboid),
+                HINT = 'The ''dsn'' parameter must refer to the node you''re running this function from, from the perspective of the node pointed to by init_from_dsn',
+                ERRCODE = 'object_not_in_prerequisite_state';
+        END IF;
+    END IF;
+
+    -- Null/empty checks are skipped, the underlying constraints on the table
+    -- will catch that for us.
+    INSERT INTO bdr.bdr_connections (
+        conn_sysid, conn_timeline, conn_dboid,
+        conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+        conn_dsn, conn_apply_delay, conn_replication_sets,
+        conn_is_unidirectional
+    ) VALUES (
+        localid.sysid, localid.timeline, localid.dboid,
+        '0', 0, 0,
+        dsn, apply_delay, replication_sets, false
+    );
+
+    -- Now ensure the per-db worker is started if it's not already running.
+    -- This won't actually take effect until commit time, it just adds a commit
+    -- hook to start the worker when we commit.
+    PERFORM bdr.bdr_connections_changed();
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_group_join(text,text,text,integer,text[])
+IS 'Join an existing BDR group by connecting to a member node and copying its contents';
+
+-- Public entry point for turning a stand-alone database into the first
+-- node of a new BDR group. Takes the same arguments as bdr_group_join
+-- minus init_from_dsn, since there is no existing member to copy from.
+CREATE FUNCTION bdr.bdr_group_create(
+    dsn text,
+    local_dsn text DEFAULT NULL,
+    apply_delay integer DEFAULT NULL,
+    replication_sets text[] DEFAULT ARRAY['default']
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+BEGIN
+    -- Creating a group is simply a join with no upstream node.
+    PERFORM bdr.bdr_group_join(
+        dsn := dsn,
+        init_from_dsn := NULL,
+        local_dsn := local_dsn,
+        apply_delay := apply_delay,
+        replication_sets := replication_sets
+    );
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_group_create(text,text,integer,text[])
+IS 'Create a BDR group, turning a stand-alone database into the first node in a BDR group';
+
+--
+-- The public interface for unidirectional replication setup.
+--
+-- Parameters:
+--   remote_dsn        node to receive changes from (required)
+--   local_dsn         how this node connects to itself (required)
+--   apply_delay       stored as conn_apply_delay for the new connection
+--   replication_sets  replication sets to subscribe to
+--
+-- Adds a unidirectional bdr_connections row pointing at the remote node and
+-- restricted to this node as origin. Raises invalid_parameter_value for a
+-- NULL dsn and object_not_in_prerequisite_state if such a row already exists
+-- or the common join checks fail.
+--
+CREATE FUNCTION bdr.bdr_subscribe(
+    remote_dsn text,
+    local_dsn text,
+    apply_delay integer DEFAULT NULL,
+    replication_sets text[] DEFAULT ARRAY['default']
+    )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+    localid record;
+    remoteid record;
+BEGIN
+    IF local_dsn IS NULL THEN
+        RAISE USING
+            MESSAGE = 'local_dsn may not be null',
+            ERRCODE = 'invalid_parameter_value';
+    END IF;
+
+    IF remote_dsn IS NULL THEN
+        RAISE USING
+            MESSAGE = 'remote_dsn may not be null',
+            ERRCODE = 'invalid_parameter_value';
+    END IF;
+
+    -- Run the shared sanity checks; this also yields the remote identity.
+    SELECT remote_sysid AS sysid, remote_timeline AS timeline,
+           remote_dboid AS dboid INTO remoteid
+    FROM bdr.internal_begin_join('bdr_subscribe', local_dsn, remote_dsn);
+
+    SELECT sysid, timeline, dboid INTO localid
+    FROM bdr.bdr_get_local_nodeid();
+
+    -- Refuse to subscribe twice to the same remote node.
+    PERFORM 1 FROM bdr_connections
+    WHERE conn_sysid = remoteid.sysid
+      AND conn_timeline = remoteid.timeline
+      AND conn_dboid = remoteid.dboid
+      AND conn_origin_sysid = localid.sysid
+      AND conn_origin_timeline = localid.timeline
+      AND conn_origin_dboid = localid.dboid
+      AND conn_is_unidirectional = 't';
+
+    IF FOUND THEN
+        RAISE USING
+            MESSAGE = 'This node is already connected to given remote node',
+            ERRCODE = 'object_not_in_prerequisite_state';
+    END IF;
+
+    -- Null/empty checks are skipped, the underlying constraints on the table
+    -- will catch that for us.
+    INSERT INTO bdr.bdr_connections (
+        conn_sysid, conn_timeline, conn_dboid,
+        conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+        conn_dsn, conn_apply_delay, conn_replication_sets,
+        conn_is_unidirectional
+    ) VALUES (
+        remoteid.sysid, remoteid.timeline, remoteid.dboid,
+        localid.sysid, localid.timeline, localid.dboid,
+        remote_dsn, apply_delay, replication_sets, true
+    );
+
+    -- Now ensure the per-db worker is started if it's not already running.
+    -- This won't actually take effect until commit time, it just adds a commit
+    -- hook to start the worker when we commit.
+    PERFORM bdr.bdr_connections_changed();
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_subscribe(text,text,integer,text[])
+IS 'Subscribe to remote logical changes';
+
+-- Block until the local node's row in bdr.bdr_nodes reports status 'r',
+-- polling every half second. Must be called at READ COMMITTED isolation;
+-- any other level raises an exception.
+CREATE FUNCTION bdr.bdr_node_join_wait_for_ready()
+RETURNS void LANGUAGE plpgsql VOLATILE AS $body$
+DECLARE
+    _node_status "char";
+BEGIN
+    -- Each poll must see rows committed since the previous one, which is
+    -- only the case when every statement takes a fresh snapshot.
+    IF current_setting('transaction_isolation') <> 'read committed' THEN
+        RAISE EXCEPTION 'Can only wait for node join in an ISOLATION LEVEL READ COMMITTED transaction, not %',
+                        current_setting('transaction_isolation');
+    END IF;
+
+    LOOP
+        SELECT INTO _node_status
+          node_status
+        FROM bdr.bdr_nodes
+        WHERE (node_sysid, node_timeline, node_dboid)
+              = bdr.bdr_get_local_nodeid();
+
+        -- Test before sleeping so an already-ready node returns at once
+        -- instead of always paying one polling interval.
+        EXIT WHEN _node_status = 'r';
+
+        PERFORM pg_sleep(0.5);
+    END LOOP;
+END;
+$body$;
+
+-- C-language entry point for the manual upgrade from GUC-based
+-- configuration; see the COMMENT below for the meaning of each argument.
+-- Not callable by ordinary users.
+CREATE FUNCTION bdr_upgrade_to_090(my_conninfo cstring, local_conninfo cstring, remote_conninfo cstring)
+RETURNS void LANGUAGE c AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION bdr_upgrade_to_090(cstring,cstring,cstring) FROM public;
+
+COMMENT ON FUNCTION bdr_upgrade_to_090(cstring,cstring,cstring)
+IS 'Upgrade a BDR 0.7.x or 0.8.x node to BDR 0.9.0 dynamic configuration. remote_conninfo is the node to connect to to perform the upgrade, my_conninfo is the dsn for other nodes to connect to this node with, local_conninfo is used to connect locally back to the node. Use null remote conninfo on the first node.';
+
+-- Undo the session settings made at the top of this script.
+RESET bdr.permit_unsafe_ddl_commands;
+RESET bdr.skip_ddl_replication;
+RESET search_path;
index 50eb1c20fc93ee002bd881861e6810c694080ab3..09459fa7791621813105499b94f5ab18053cabac 100644 (file)
@@ -11,7 +11,7 @@
 
 errlog()
 {
-    echo "$@" 1>&2
+   echo "$@" 1>&2
 }
 
 JOBS=1
@@ -23,78 +23,78 @@ PGDUMP=
 PGRESTORE=
 
 while (($i < ${#argv[*]})); do
-    case "${argv[$i]}" in
+   case "${argv[$i]}" in
    -V)
-       echo "bdr_initial_load (PostgreSQL PG_VERSION, BDR BDR_VERSION)"
-       exit
+       echo "bdr_initial_load (PostgreSQL PG_VERSION, BDR BDR_VERSION)"
+       exit
    ;;
-        --snapshot)
-            ((i++)); SNAPSHOT="${argv[$i]}"
-        ;;
-        --source)
-            ((i++)); SOURCE="${argv[$i]}"
-        ;;
-        --target)
-            ((i++)); TARGET="${argv[$i]}"
-        ;;
-        --tmp-directory)
-            ((i++)); TMPDIR="${argv[$i]}"
-        ;;
-        --jobs)
-            ((i++)); JOBS="${argv[$i]}"
+   --snapshot)
+       ((i++)); SNAPSHOT="${argv[$i]}"
    ;;
-        --pg-dump-path)
-            ((i++)); PGDUMP="${argv[$i]}"
-        ;;
-        --pg-restore-path)
-            ((i++)); PGRESTORE="${argv[$i]}"
-        ;;
-        --help)
-            errlog "Usage: bdr_replica --source <dsn> --target <dsn> [--snapshot <name>] --dir /path/to/dir [--jobs N]"
-            errlog "<dsn> is a libpq conninfo string, e.g. \"host=/tmp post=5433 dbname=xxx\""
-            exit 0
-        ;;
-        *)
-            errlog Unknown command-line option: ${argv[$i]}
-            exit 1
-        ;;
-    esac
+   --source)
+       ((i++)); SOURCE="${argv[$i]}"
+   ;;
+   --target)
+       ((i++)); TARGET="${argv[$i]}"
+   ;;
+   --tmp-directory)
+       ((i++)); TMPDIR="${argv[$i]}"
+   ;;
+   --jobs)
+       ((i++)); JOBS="${argv[$i]}"
+   ;;
+   --pg-dump-path)
+       ((i++)); PGDUMP="${argv[$i]}"
+   ;;
+   --pg-restore-path)
+       ((i++)); PGRESTORE="${argv[$i]}"
+   ;;
+   --help)
+       errlog "Usage: bdr_replica --source <dsn> --target <dsn> [--snapshot <name>] --dir /path/to/dir [--jobs N]"
+       errlog "<dsn> is a libpq conninfo string, e.g. \"host=/tmp post=5433 dbname=xxx\""
+       exit 0
+   ;;
+   *)
+       errlog Unknown command-line option: ${argv[$i]}
+       exit 1
+   ;;
+   esac
 
-    ((i++))
+   ((i++))
 done
 
+# All of the options below are mandatory: report the first one that is
+# missing, naming the exact flag the option parser accepts, and exit 1.
 if [ -z "$SOURCE" ]; then
-    errlog Please specify a source DSN with '--source "port=nnn dbname=xxx"'; exit 1
+   errlog Please specify a source DSN with '--source "port=nnn dbname=xxx"'; exit 1
 fi
 
 if [ -z "$TARGET" ]; then
-    errlog Please specify a target DSN with '--target "port=nnn dbname=xxx"'; exit 1
+   errlog Please specify a target DSN with '--target "port=nnn dbname=xxx"'; exit 1
 fi
 
 if [ -z "$TMPDIR" ]; then
-    errlog Please specify a directory with '--temp-directory /path/to/dir'; exit 1
+   errlog Please specify a directory with '--tmp-directory /path/to/dir'; exit 1
 fi
 
 if [ -z "$PGDUMP" ]; then
-    errlog The path to pg_dump must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
+   errlog The path to pg_dump must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
 fi
 
 if [ -z "$PGRESTORE" ]; then
-    errlog The path to pg_restore must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
+   errlog The path to pg_restore must be specified with '--pg-restore-path ./path/pg_restore'; exit 1
 fi
 
 SNAP=${SNAPSHOT:+"--snapshot $SNAPSHOT"}
 
 errlog "Dumping remote database \"$SOURCE\" with $JOBS concurrent workers to \"$TMPDIR\""
-if ! "$PGDUMP" -j $JOBS $SNAP -F d -f $TMPDIR "$SOURCE"; then
-    errlog "bdr_dump of "$SOURCE" failed, aborting"
-    exit 1
+if ! "$PGDUMP" -T "bdr.bdr_nodes" -T "bdr.bdr_connections" -j $JOBS $SNAP -F d -f $TMPDIR "$SOURCE"; then
+   errlog "bdr_dump of "$SOURCE" failed, aborting"
+   exit 1
 fi
 
 errlog "Restoring dump to local DB \"$TARGET\" with $JOBS concurrent workers from \"$TMPDIR\""
-if ! "$PGRESTORE" --exit-on-error --single-transaction -j $JOBS -F d -d "$TARGET" $TMPDIR; then
-    errlog "pg_restore to "$TARGET" failed, aborting"
-    exit 2
+if ! "$PGRESTORE" --exit-on-error -j $JOBS -F d -d "$TARGET" $TMPDIR; then
+   errlog "pg_restore to "$TARGET" failed, aborting"
+   exit 2
 fi
 
 exit 0
diff --git a/specs/isolation/init.spec b/specs/isolation/init.spec
new file mode 100644 (file)
index 0000000..1b824f0
--- /dev/null
@@ -0,0 +1,127 @@
+conninfo "node1" "dbname=node1"
+conninfo "node2" "dbname=node2"
+conninfo "node3" "dbname=node3"
+
+session "snode1"
+
+# pg_xlog_wait_remote_apply isn't good enough alone as it doesn't permit us to
+# say how many nodes must be present.  It'll succeed if there are zero nodes.
+# So we first have to wait for enough replication connections.
+#
+# The reason why we call pg_stat_clear_snapshot() is that pg_stat_activity is
+# cached when first accessed so repeat access within the same transaction sees
+# unchanging results. As pg_stat_replication joins pg_stat_get_wal_senders() on
+# pg_stat_activity, new walsenders are filtered out by the join unless we force
+# a refresh of pg_stat_activity.
+
+connection "node1"
+
+step "setup1"
+{
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+}
+
+
+step "join_root"
+{
+   SELECT bdr.bdr_group_create(
+       dsn := 'dbname=node1'
+       );
+}
+
+# Polls pg_stat_replication once a second until it lists exactly 6
+# walsenders, then blocks until they have caught up to the xlog position
+# captured at that moment (pid argument 0, i.e. not restricted to one
+# walsender).
+# NOTE(review): 6 is presumably 3 nodes x 2 peer connections each, with all
+# three databases on one postmaster -- confirm. The loop condition is "<> 6",
+# so it never terminates if more than 6 walsenders are present.
+step "wait"
+{
+   -- pg_xlog_wait_remote_apply isn't good enough alone
+   -- as it doesn't permit us to say how many nodes must be present.
+   -- It'll succeed if there are zero nodes. So we first have to wait
+   -- for enough replication connections.
+   DO $$
+   DECLARE
+       nodecount integer := 0;
+       target_lsn pg_lsn;
+   BEGIN
+       WHILE nodecount <> 6
+       LOOP
+           PERFORM pg_sleep(1);
+           PERFORM pg_stat_clear_snapshot();
+           -- Now find out how many walsenders are running
+           nodecount := (SELECT count(*)
+                         FROM pg_catalog.pg_stat_replication);
+           RAISE NOTICE 'Found % nodes',nodecount;
+       END LOOP;
+       -- OK, all nodes seen, now we wait for catchup on them all.
+       target_lsn := pg_current_xlog_location();
+       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
+       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
+       RAISE NOTICE 'Catchup to LSN completed';
+   END;
+   $$;
+}
+
+session "snode2"
+connection "node2"
+
+step "setup2"
+{
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+}
+
+
+step "join_2"
+{
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node2',
+       init_from_dsn := 'dbname=node1'
+       );
+}
+
+step "wait_join_2"
+{
+   SELECT bdr.bdr_node_join_wait_for_ready();
+}
+
+step "check_join_2"
+{
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+}
+
+session "snode3"
+connection "node3"
+
+step "setup3"
+{
+   CREATE EXTENSION btree_gist;
+   CREATE EXTENSION bdr;
+}
+
+
+step "join_3"
+{
+   SELECT bdr.bdr_group_join(
+       dsn := 'dbname=node3',
+       init_from_dsn := 'dbname=node1',
+       local_dsn := 'dbname=node3'
+       );
+}
+
+step "wait_join_3"
+{
+   SELECT bdr.bdr_node_join_wait_for_ready();
+}
+
+step "check_join_3"
+{
+   SELECT pg_stat_clear_snapshot();
+   SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+   SELECT count(*) FROM pg_stat_replication;
+   SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+   SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+}
+
+permutation "setup1" "setup2" "setup3" "join_root" "join_2" "wait_join_2" "check_join_2" "join_3" "wait_join_3" "check_join_3" "wait"
diff --git a/specs/isolation/waitforstart.spec b/specs/isolation/waitforstart.spec
deleted file mode 100644 (file)
index b5ef85c..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-conninfo "node1" "dbname=node1"
-conninfo "node2" "dbname=node2"
-conninfo "node3" "dbname=node3"
-
-session "snode1"
-
-# pg_xlog_wait_remote_apply isn't good enough alone as it doesn't permit us to
-# say how many nodes must be present.  It'll succeed if there are zero nodes.
-# So we first have to wait for enough replication connections.
-#
-# The reason why we call pg_stat_clear_snapshot() is that pg_stat_activity is
-# cached when first accessed so repeat access within the same transaction sees
-# unchanging results. As pg_stat_replication joins pg_stat_get_wal_senders() on
-# pg_stat_activity, new walsenders are filtered out by the join unles we force
-# a refresh of pg_stat_activity.
-
-step "wait"
-{
-   DO $$
-   DECLARE
-       nodecount integer := 0;
-       target_lsn pg_lsn;
-   BEGIN
-       WHILE nodecount <> 6
-       LOOP
-           PERFORM pg_sleep(1);
-           PERFORM pg_stat_clear_snapshot();
-           -- Now find out how many walsenders are running
-           nodecount := (SELECT count(*)
-                         FROM pg_catalog.pg_stat_replication);
-           RAISE NOTICE 'Found % nodes',nodecount;
-       END LOOP;
-       -- OK, all nodes seen, now we wait for catchup on them all.
-       target_lsn := pg_current_xlog_location();
-       RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
-       PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
-       RAISE NOTICE 'Catchup to LSN completed';
-   END;
-   $$;
-}
-
-permutation "wait"
index e47c6f180b489c399613cb6ae450463ede7245fc..19140e54e59feb8adcb4a5d6a561191f9f728aa0 100644 (file)
@@ -212,6 +212,7 @@ CREATE TABLE tbl_with_oids() WITH OIDS;
 CREATE TABLE tbl_without_oids() WITHOUT oids;
 DROP TABLE tbl_without_oids;
 SET default_with_oids = false;
+SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), pid) FROM pg_stat_replication;
 
 --- AGGREGATE ---
 \c postgres
index 10271198731d4458481e9d7f9cb33bee084d2a21..1a7c1de30f8000257436b3de6840d00c1f398ed7 100644 (file)
@@ -12,37 +12,10 @@ GRANT ALL ON SCHEMA public TO nonsuper;
 \c regression
 GRANT ALL ON SCHEMA public TO nonsuper;
 
-SELECT pg_sleep(10);
-
--- emulate the pg_xlog_wait_remote_apply on vanilla postgres
-DO $DO$BEGIN
-   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
-   IF FOUND THEN
-       RETURN;
-   END IF;
-
-   PERFORM bdr.bdr_replicate_ddl_command($DDL$
-       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
-       AS $FUNC$
-       BEGIN
-           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
-               PERFORM pg_sleep(0.01);
-           END LOOP;
-       END;$FUNC$ LANGUAGE plpgsql;
-   $DDL$);
-END;$DO$;
+\c postgres
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
 
-SELECT bdr.bdr_replicate_ddl_command($DDL$
-CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
-    OUT readdb1 text,
-    OUT readdb2 text,
-    OUT writedb1 text,
-    OUT writedb2 text
-    ) RETURNS record LANGUAGE SQL AS $f$
-SELECT
-    current_setting('bdrtest.readdb1'),
-    current_setting('bdrtest.readdb2'),
-    current_setting('bdrtest.writedb1'),
-    current_setting('bdrtest.writedb2')
-$f$;
-$DDL$);
+\c regression
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
diff --git a/sql/init_bdr.sql b/sql/init_bdr.sql
new file mode 100644 (file)
index 0000000..6f584bc
--- /dev/null
@@ -0,0 +1,44 @@
+\c postgres
+SELECT bdr.bdr_group_create(
+   dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-1']
+   );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+\c regression
+SELECT bdr.bdr_group_join(
+   dsn := 'dbname=regression',
+   init_from_dsn := 'dbname=postgres',
+   local_dsn := 'dbname=regression',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+-- Make sure we see two slots and two active connections
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT count(*) FROM pg_stat_replication;
+
+\c postgres
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
diff --git a/sql/init_udr.sql b/sql/init_udr.sql
new file mode 100644 (file)
index 0000000..2e3a62b
--- /dev/null
@@ -0,0 +1,52 @@
+\c postgres
+SELECT bdr.bdr_subscribe(
+   remote_dsn := 'dbname=regression',
+   local_dsn := 'dbname=postgres',
+   replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+   );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+-- Make sure we see the slot and active connection
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT count(*) FROM pg_stat_replication;
+
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+
+-- emulate the pg_xlog_wait_remote_apply on vanilla postgres
+DO $DO$BEGIN
+   PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
+   IF FOUND THEN
+       RETURN;
+   END IF;
+
+   PERFORM bdr.bdr_replicate_ddl_command($DDL$
+       CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
+       AS $FUNC$
+       BEGIN
+           WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
+               PERFORM pg_sleep(0.01);
+           END LOOP;
+       END;$FUNC$ LANGUAGE plpgsql;
+   $DDL$);
+END;$DO$;
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+    OUT readdb1 text,
+    OUT readdb2 text,
+    OUT writedb1 text,
+    OUT writedb2 text
+    ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+    current_setting('bdrtest.readdb1'),
+    current_setting('bdrtest.readdb2'),
+    current_setting('bdrtest.writedb1'),
+    current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
index 648cb754ef35c555f4ebe5216169ca050defead0..b0a4209511041c2a99e4bc09ecfcb42596ff21c0 100644 (file)
@@ -36,6 +36,9 @@ DROP EXTENSION bdr;
 CREATE EXTENSION bdr VERSION '0.9.0.0';
 DROP EXTENSION bdr;
 
+CREATE EXTENSION bdr VERSION '0.9.0.1';
+DROP EXTENSION bdr;
+
 -- evolve version one by one from the oldest to the newest one
 CREATE EXTENSION bdr VERSION '0.8.0';
 ALTER EXTENSION bdr UPDATE TO '0.8.0.1';
@@ -46,6 +49,7 @@ ALTER EXTENSION bdr UPDATE TO '0.8.0.5';
 ALTER EXTENSION bdr UPDATE TO '0.8.0.6';
 ALTER EXTENSION bdr UPDATE TO '0.8.0.7';
 ALTER EXTENSION bdr UPDATE TO '0.9.0.0';
+ALTER EXTENSION bdr UPDATE TO '0.9.0.1';
 
 
 -- Should never have to do anything: You missed adding the new version above.
diff --git a/sql/upgrade_sim_0800.sql b/sql/upgrade_sim_0800.sql
new file mode 100644 (file)
index 0000000..c84f70f
--- /dev/null
@@ -0,0 +1,183 @@
+--
+-- Attempt to simulate an upgrade from BDR 0.8.0 to the current
+-- version.
+--
+-- 0.8.0 used GUCs for bdr.connections DSN configuration, etc. We can manually
+-- create the slots, replication identifiers, and bdr.bdr_nodes entries as if
+-- this was a 0.8.0 DB just about to be upgraded, then upgrade the extension
+-- and execute the upgrade process.
+--
+
+
+CREATE DATABASE upgrade_sim_0800_a;
+CREATE DATABASE upgrade_sim_0800_b;
+
+\c upgrade_sim_0800_a;
+------------------------------------------
+-- Prepare node upgrade_sim_0800_a      --
+------------------------------------------
+
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr VERSION '0.8.0';
+
+-- public.bdr_get_local_nodeid() is defined in the bdr ext's C lib
+-- exposed in 0.8.0's SQL extension. We have to use it to create
+-- the required slots etc, so create it in public.
+
+CREATE FUNCTION public.bdr_get_local_nodeid( sysid OUT text, timeline OUT oid, dboid OUT oid)
+RETURNS record LANGUAGE c AS 'bdr';
+
+CREATE TABLE dummytable(
+   id integer primary key,
+   somevalue text
+);
+
+INSERT INTO dummytable(id, somevalue) VALUES (1, '42'), (2, 'fred');
+
+SELECT pg_replication_identifier_create(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       ''
+   )
+)
+FROM public.bdr_get_local_nodeid();
+
+INSERT INTO bdr.bdr_nodes
+(node_sysid, node_timeline, node_dboid, node_status)
+SELECT
+   sysid, timeline, (SELECT oid FROM pg_database WHERE datname = dn), 'r'
+FROM (VALUES ('upgrade_sim_0800_a'), ('upgrade_sim_0800_b')) x(dn),
+    public.bdr_get_local_nodeid();
+
+SELECT pg_create_logical_replication_slot(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       ''
+   ),
+   'bdr')
+FROM public.bdr_get_local_nodeid();
+
+DROP FUNCTION public.bdr_get_local_nodeid();
+
+
+
+
+
+
+\c upgrade_sim_0800_b;
+------------------------------------------
+-- Prepare node upgrade_sim_0800_b      --
+------------------------------------------
+
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr VERSION '0.8.0';
+
+CREATE FUNCTION public.bdr_get_local_nodeid( sysid OUT text, timeline OUT oid, dboid OUT oid)
+RETURNS record LANGUAGE c AS 'bdr';
+
+CREATE TABLE dummytable(
+   id integer primary key,
+   somevalue text
+);
+
+INSERT INTO dummytable(id, somevalue) VALUES (1, '42'), (2, 'fred');
+
+SELECT pg_replication_identifier_create(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       ''
+   )
+)
+FROM public.bdr_get_local_nodeid();
+
+INSERT INTO bdr.bdr_nodes
+(node_sysid, node_timeline, node_dboid, node_status)
+SELECT
+   sysid, timeline, (SELECT oid FROM pg_database WHERE datname = dn), 'r'
+FROM (VALUES ('upgrade_sim_0800_a'), ('upgrade_sim_0800_b')) x(dn),
+    public.bdr_get_local_nodeid();
+
+SELECT pg_create_logical_replication_slot(
+   format('bdr_%s_%s_%s_%s__%s',
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+       sysid, timeline,
+       (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+       ''
+   ),
+   'bdr')
+FROM public.bdr_get_local_nodeid();
+
+DROP FUNCTION public.bdr_get_local_nodeid();
+
+
+
+
+
+------------------------------------------
+-- Test the upgrade                     --
+------------------------------------------
+--
+-- We now have two databases that look like they were running BDR, with
+-- contents in sync at the time of upgrade. The origin replication identifier
+-- information is wrong as both have InvalidRepNodeId but we don't really care
+-- about that. It's as if we deleted bdr.bdr_connections then started the DB
+-- up.
+--
+-- Time to upgrade to dynconf. Hope this works!
+--
+
+-- First the extension must be updated on BOTH nodes
+\c upgrade_sim_0800_a
+ALTER EXTENSION bdr UPDATE;
+\c upgrade_sim_0800_b
+ALTER EXTENSION bdr UPDATE;
+
+
+-- then one must be upgraded standalone. For this one we'll provide no local
+-- dsn; it must be inferred from the node dsn in that case. There's also no
+-- remote DSN since it's the first node.
+\c upgrade_sim_0800_a
+SELECT bdr.bdr_upgrade_to_090('dbname=upgrade_sim_0800_a', NULL, NULL);
+
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+ORDER BY datname;
+
+SELECT * FROM pg_catalog.pg_shseclabel
+WHERE classoid = (SELECT oid FROM pg_class WHERE relname = 'pg_database')
+  AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+
+-- Upgrade the second node using the first node. This time we'll
+-- supply a local dsn too, though it'll be the same.
+\c upgrade_sim_0800_b
+
+-- must have old nodes, no replication can have occurred
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+ORDER BY datname;
+
+SELECT bdr.bdr_upgrade_to_090('dbname=upgrade_sim_0800_b', 'dbname=upgrade_sim_0800_b', 'dbname=upgrade_sim_0800_a');
+
+-- local node must be updated. Remote node could be either as replication
+-- might or might not have sent it yet.
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+WHERE datname = current_database()
+ORDER BY datname;
+
+SELECT * FROM pg_catalog.pg_shseclabel
+WHERE classoid = (SELECT oid FROM pg_class WHERE relname = 'pg_database')
+  AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+-- TODO: wait for remote apply, switch back
+
+-- TODO: use test table
+
+-- TODO: lots of failure cases