extsql/bdr--0.8.0.4--0.8.0.5.sql \
extsql/bdr--0.8.0.5--0.8.0.6.sql \
extsql/bdr--0.8.0.6--0.8.0.7.sql \
- extsql/bdr--0.8.0.7--0.9.0.0.sql
+ extsql/bdr--0.8.0.7--0.9.0.0.sql \
+ extsql/bdr--0.9.0.0--0.9.0.1.sql
DATA_built = \
extsql/bdr--0.8.0.1.sql \
extsql/bdr--0.8.0.5.sql \
extsql/bdr--0.8.0.6.sql \
extsql/bdr--0.8.0.7.sql \
- extsql/bdr--0.9.0.0.sql
+ extsql/bdr--0.9.0.0.sql \
+ extsql/bdr--0.9.0.1.sql
DOCS = bdr.conf.sample README.bdr
SCRIPTS = scripts/bdr_initial_load bdr_init_copy bdr_resetxlog bdr_dump
bdr_conflict_handlers.o \
bdr_conflict_logging.o \
bdr_commandfilter.o \
+ bdr_common.o \
bdr_compat.o \
bdr_count.o \
bdr_executor.o \
bdr_locks.o \
bdr_output.o \
bdr_relcache.o \
- bdr_remotecalls.o
+ bdr_remotecalls.o \
+ bdr_supervisor.o \
+ bdr_upgrade.o
ifeq "@BUILDING_BDR@" "1"
OBJS += \
$(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(libpq_pgport) $(LIBS) -o $@$(X)
scripts/bdr_initial_load: scripts/bdr_initial_load.in
- mkdir -p scripts
sed -e "s/BDR_VERSION/$(BDR_VERSION)/" -e "s/PG_VERSION/$(VERSION)/" $< > $@
extsql/bdr--0.8.0.1.sql: extsql/bdr--0.8.0.sql extsql/bdr--0.8.0--0.8.0.1.sql
cat $^ > $@
extsql/bdr--0.9.0.0.sql: extsql/bdr--0.8.0.7.sql extsql/bdr--0.8.0.7--0.9.0.0.sql
+ mkdir -p extsql
+ cat $^ > $@
+
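+# Each full-version extension script is simply the previous full script with
+# the incremental upgrade script for the new version appended, as in the
+# rule below.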
+extsql/bdr--0.9.0.1.sql: extsql/bdr--0.9.0.0.sql extsql/bdr--0.9.0.0--0.9.0.1.sql
+ mkdir -p extsql
cat $^ > $@
bdr_resetxlog: pg_resetxlog.o
DDLREGRESSCHECKS=ddl/create ddl/alter_table ddl/extension ddl/function \
ddl/grant ddl/mixed ddl/namespace ddl/replication_set \
ddl/sequence ddl/view
+REGRESSINIT=init_bdr
else
check: regresscheck
DDLREGRESSCHECKS=
+REGRESSINIT=init_udr
endif
+
REGRESSCHECKS= \
init \
+ $(REGRESSINIT) \
upgrade \
identifier \
$(DDLREGRESSCHECKS) \
ISOLATIONCHECKS=\
- isolation/waitforstart \
+ isolation/init \
isolation/ddlconflict \
isolation/dmlconflict_ii \
isolation/dmlconflict_uu \
/* end externs for bdr apply state */
ResourceOwner bdr_saved_resowner;
-static bool bdr_is_restart = false;
Oid BdrNodesRelid;
Oid BdrConflictHistoryRelId;
Oid BdrLocksRelid;
Oid BdrLocksByOwnerRelid;
Oid BdrReplicationSetConfigRelid;
-BdrConnectionConfig **bdr_connection_configs;
-/* All databases for which BDR is configured, valid after _PG_init */
-char **bdr_distinct_dbnames;
-uint32 bdr_distinct_dbnames_count = 0;
-
/* GUC storage */
static char *connections = NULL;
static bool bdr_synchronous_commit;
int bdr_default_apply_delay;
int bdr_max_workers;
+int bdr_max_databases;
static bool bdr_skip_ddl_replication;
bool bdr_skip_ddl_locking;
bool bdr_do_not_replicate;
/* shortcut for finding the worker shmem block */
BdrWorkerControl *BdrWorkerCtl = NULL;
+/* This worker's block within BdrWorkerCtl - only valid in bdr workers */
+BdrWorker *bdr_worker_slot = NULL;
+
+/* Worker generation number; see bdr_worker_shmem_startup comments */
+static uint16 bdr_worker_generation;
+
+
PG_MODULE_MAGIC;
void _PG_init(void);
static void bdr_worker_shmem_startup(void);
-static void bdr_worker_shmem_create_workers(void);
PGDLLEXPORT Datum bdr_apply_pause(PG_FUNCTION_ARGS);
PGDLLEXPORT Datum bdr_apply_resume(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(bdr_variant);
PG_FUNCTION_INFO_V1(bdr_get_local_nodeid);
-static void
+void
bdr_sigterm(SIGNAL_ARGS)
{
int save_errno = errno;
errno = save_errno;
}
-static void
+void
bdr_sighup(SIGNAL_ARGS)
{
int save_errno = errno;
*
* The replication identifier is allocated in the current memory context.
*/
-void
+static void
bdr_build_ident_and_slotname(uint64 remote_sysid, TimeLineID remote_tlid,
Oid remote_dboid, char **out_replication_identifier,
Name out_slot_name)
/* make sure BDR extension exists */
bdr_executor_always_allow_writes(true);
StartTransactionCommand();
- bdr_maintain_schema();
+ bdr_maintain_schema(true);
CommitTransactionCommand();
bdr_executor_always_allow_writes(false);
return streamConn;
}
-/*
- * In postmaster, at shared_preload_libaries time, create the GUCs for a
- * connection. They'll be accessed by the apply worker that uses these GUCs
- * later.
- *
- * Returns false if the config wasn't created for some reason (missing
- * required options, etc); true if it's ok. Out parameters are not changed if
- * false is returned.
- *
- * Params:
- *
- * name
- * Name of this conn - bdr.<name>
- *
- * used_databases
- * Array of char*, names of distinct databases named in configured conns
- *
- * num_used_databases
- * Number of distinct databases named in conns
- *
- * out_config
- * Assigned a palloc'd pointer to GUC storage for this config'd connection
- *
- * out_config is set even if false is returned, as the GUCs have still been
- * created. Test out_config->is_valid to see whether the connection is usable.
- */
-static bool
-bdr_create_con_gucs(char *name,
- char **used_databases,
- Size *num_used_databases,
- char **database_initcons,
- BdrConnectionConfig **out_config)
-{
- Size off;
- char *errormsg = NULL;
- PQconninfoOption *options;
- PQconninfoOption *cur_option;
- BdrConnectionConfig *opts;
-
- /* don't free, referenced by the guc machinery! */
- char *optname_dsn = palloc(strlen(name) + 30);
- char *optname_delay = palloc(strlen(name) + 30);
- char *optname_replica = palloc(strlen(name) + 30);
- char *optname_local_dsn = palloc(strlen(name) + 30);
- char *optname_local_dbname = palloc(strlen(name) + 30);
- char *optname_replication_sets = palloc(strlen(name) + 30);
-
- Assert(process_shared_preload_libraries_in_progress);
-
- /* Ensure the connection name is legal */
- if (strchr(name, '_') != NULL)
- {
- ereport(ERROR,
- (errmsg("bdr.connections entry '%s' contains the '_' character, which is not permitted", name)));
- }
-
- /* allocate storage for connection parameters */
- opts = palloc0(sizeof(BdrConnectionConfig));
- opts->is_valid = false;
- *out_config = opts;
-
- opts->name = pstrdup(name);
-
- /* Define GUCs for this connection */
- sprintf(optname_dsn, "bdr.%s_dsn", name);
- DefineCustomStringVariable(optname_dsn,
- optname_dsn,
- NULL,
- &opts->dsn,
- NULL, PGC_POSTMASTER,
- GUC_NOT_IN_SAMPLE,
- NULL, NULL, NULL);
-
- sprintf(optname_delay, "bdr.%s_apply_delay", name);
- DefineCustomIntVariable(optname_delay,
- optname_delay,
- NULL,
- &opts->apply_delay,
- -1, -1, INT_MAX,
- PGC_SIGHUP,
- GUC_UNIT_MS,
- NULL, NULL, NULL);
-
- sprintf(optname_replica, "bdr.%s_init_replica", name);
- DefineCustomBoolVariable(optname_replica,
- optname_replica,
- NULL,
- &opts->init_replica,
- false,
- PGC_SIGHUP,
- 0,
- NULL, NULL, NULL);
-
- sprintf(optname_local_dsn, "bdr.%s_replica_local_dsn", name);
- DefineCustomStringVariable(optname_local_dsn,
- optname_local_dsn,
- NULL,
- &opts->replica_local_dsn,
- NULL, PGC_POSTMASTER,
- GUC_NOT_IN_SAMPLE,
- NULL, NULL, NULL);
-
- sprintf(optname_local_dbname, "bdr.%s_local_dbname", name);
- DefineCustomStringVariable(optname_local_dbname,
- optname_local_dbname,
- NULL,
- &opts->dbname,
- NULL, PGC_POSTMASTER,
- GUC_NOT_IN_SAMPLE,
- NULL, NULL, NULL);
-
- sprintf(optname_replication_sets, "bdr.%s_replication_sets", name);
- DefineCustomStringVariable(optname_replication_sets,
- optname_replication_sets,
- NULL,
- &opts->replication_sets,
- NULL, PGC_POSTMASTER,
- GUC_LIST_INPUT | GUC_LIST_QUOTE,
- NULL, NULL, NULL);
-
-
- if (!opts->dsn)
- {
- elog(WARNING, "bdr %s: no connection information", name);
- return false;
- }
-
- elog(DEBUG2, "bdr %s: dsn=%s", name, opts->dsn);
-
- options = PQconninfoParse(opts->dsn, &errormsg);
- if (errormsg != NULL)
- {
- char *str = pstrdup(errormsg);
-
- PQfreemem(errormsg);
- ereport(ERROR,
- (errcode(ERRCODE_CONFIG_FILE_ERROR),
- errmsg("bdr %s: error in dsn: %s", name, str)));
- }
-
- if (opts->dbname == NULL)
- {
- cur_option = options;
- while (cur_option->keyword != NULL)
- {
- if (strcmp(cur_option->keyword, "dbname") == 0)
- {
- if (cur_option->val == NULL)
- ereport(ERROR,
- (errcode(ERRCODE_CONFIG_FILE_ERROR),
- errmsg("bdr %s: no dbname set", name)));
-
- opts->dbname = pstrdup(cur_option->val);
- elog(DEBUG2, "bdr %s: dbname=%s", name, opts->dbname);
- }
-
- if (cur_option->val != NULL)
- {
- elog(DEBUG3, "bdr %s: opt %s, val: %s",
- name, cur_option->keyword, cur_option->val);
- }
- cur_option++;
- }
- }
-
- /* cleanup */
- PQconninfoFree(options);
-
- /*
- * If this is a DB name we haven't seen yet, add it to our set of known
- * DBs.
- */
- for (off = 0; off < *num_used_databases; off++)
- {
- if (strcmp(opts->dbname, used_databases[off]) == 0)
- break;
- }
-
- if (off == *num_used_databases)
- {
- /* Didn't find a match, add new db name */
- used_databases[(*num_used_databases)++] =
- pstrdup(opts->dbname);
- elog(DEBUG2, "bdr %s: Saw new database %s, now %i known dbs",
- name, opts->dbname, (int)(*num_used_databases));
- }
-
- /*
- * Make sure that at most one of the worker configs for each DB can be
- * configured to run initialization.
- */
- if (opts->init_replica)
- {
- elog(DEBUG2, "bdr %s: has init_replica=t", name);
- if (database_initcons[off] != NULL)
- ereport(ERROR,
- (errcode(ERRCODE_CONFIG_FILE_ERROR),
- errmsg("Connections %s and %s on database %s both have bdr_init_replica enabled, cannot continue",
- name, database_initcons[off], used_databases[off])));
- else
- database_initcons[off] = name; /* no need to pstrdup, see _PG_init */
- }
-
- opts->is_valid = true;
-
- /* optname vars intentionally leaked, see above */
- return true;
-}
-
static size_t
bdr_worker_shmem_size()
{
/* Init shm segment header after postmaster start or restart */
memset(BdrWorkerCtl, 0, bdr_worker_shmem_size());
BdrWorkerCtl->lock = LWLockAssign();
+ /* Assigned on supervisor launch */
+ BdrWorkerCtl->supervisor_latch = NULL;
/*
- * Now that the shm segment is initialized, we can populate it with
- * BdrWorker entries for the connections we created GUCs for during
- * _PG_init.
+ * The postmaster keeps track of a generation number for BDR workers
+ * and increments it at each restart.
+ *
+ * Background workers aren't unregistered when the postmaster restarts
+ * and clears shared memory, so after a restart the supervisor and
+ * per-db workers have no idea what workers are/aren't running, nor any
+ * way to control them. To make a clean BDR restart possible the
+ * workers registered before the restart need to find out about the
+ * restart and terminate.
+ *
+ * To make that possible we pass the generation number to the worker
+ * in its main argument, and also set it in shared memory. The two
+	 * must match. If they don't, the worker will proc_exit(0), causing
+	 * itself to be unregistered.
*
- * We must do this whether it's initial launch or a postmaster restart,
- * as shmem gets cleared on postmaster restart.
+	 * This should really be part of the bgworker API itself, handled via
+ * a BGW_NO_RESTART_ON_CRASH flag or by providing a generation number
+ * as a bgworker argument. However, for now we're stuck with this
+ * workaround.
*/
- bdr_worker_shmem_create_workers();
+ if (bdr_worker_generation == UINT16_MAX)
+ /* We could handle wrap-around, but really ... */
+ elog(FATAL, "Too many postmaster crash/restart cycles. Restart the PostgreSQL server.");
+
+ BdrWorkerCtl->worker_generation = ++bdr_worker_generation;
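+
+	/*
+	 * For reference: apply workers launched after this receive the
+	 * generation in the high 16 bits of their bgworker main argument
+	 * (their BdrWorkerCtl slot index is in the low 16 bits), and
+	 * bdr_apply_main() unpacks and checks it, roughly:
+	 *
+	 *     worker_generation = (uint16) (worker_arg >> 16);
+	 *     apply_worker_idx  = (uint16) (worker_arg & 0xFFFF);
+	 *     if (worker_generation != BdrWorkerCtl->worker_generation)
+	 *         proc_exit(0);
+	 */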
}
LWLockRelease(AddinShmemInitLock);
*/
}
-/*
- * After _PG_init we've read the GUCs for the workers but haven't populated the
- * shared memory segment at BdrWorkerCtl with BDRWorker entries yet.
- *
- * The shm segment is initialized now, so do that.
- */
-static void
-bdr_worker_shmem_create_workers(void)
-{
- uint32 off;
-
- /*
- * Create a BdrPerdbWorker for each distinct database found during
- * _PG_init. The bgworker for each has already been registered and assigned
- * a slot position during _PG_init, but the slot doesn't have anything
- * useful in it yet. Because it was already registered we don't need
- * any protection against duplicate launches on restart here.
- *
- * Because these slots are pre-assigned before shmem is bought up they
- * MUST be reserved first, before any shmem entries are allocated, so
- * they get the first slots.
- *
- * When started, this worker will continue setup - doing any required
- * initialization of the database, then registering dynamic bgworkers for
- * the DB's individual BDR connections.
- *
- * If we ever want to support dynamically adding/removing DBs from BDR at
- * runtime, this'll need to move into a static bgworker because dynamic
- * bgworkers can't be launched directly from the postmaster. We'll need a
- * "bdr manager" static bgworker.
- */
-
- for (off = 0; off < bdr_distinct_dbnames_count; off++)
- {
- BdrWorker *shmworker;
- BdrPerdbWorker *perdb;
- uint32 ctl_idx;
-
- shmworker = (BdrWorker *) bdr_worker_shmem_alloc(BDR_WORKER_PERDB, &ctl_idx);
- Assert(shmworker->worker_type == BDR_WORKER_PERDB);
- /*
- * The workers have already been assigned shmem indexes during
- * _PG_init, so they MUST get the same index here. So long as these
- * entries are assigned before any other shmem slots they will.
- */
- Assert(ctl_idx == off);
- perdb = &shmworker->data.perdb;
-
- strncpy(NameStr(perdb->dbname), bdr_distinct_dbnames[off], NAMEDATALEN);
- NameStr(perdb->dbname)[NAMEDATALEN-1] = '\0';
-
- perdb->nnodes = 0;
- perdb->seq_slot = off;
-
- elog(DEBUG1, "Assigning shmem bdr database worker for db %s",
- NameStr(perdb->dbname));
- }
-
- /*
- * Populate shmem with a BdrApplyWorker for each valid BdrConnectionConfig
- * found during _PG_init so that the per-db worker will register it for
- * startup after performing any BDR initialisation work.
- *
- * Use of shared memory for this is required for EXEC_BACKEND (windows)
- * where we can't share postmaster memory, and for when we're launching a
- * bgworker from another bgworker where the fork() from postmaster doesn't
- * provide access to the launching bgworker's memory.
- *
- * The workers aren't actually launched here, they get launched by
- * launch_apply_workers(), called by the database's per-db static worker.
- */
- for (off = 0; off < bdr_max_workers; off++)
- {
- BdrConnectionConfig *cfg = bdr_connection_configs[off];
- BdrWorker *shmworker;
- BdrApplyWorker *worker;
- int i;
- bool found_perdb = false;
-
- if (cfg == NULL || !cfg->is_valid)
- continue;
-
- shmworker = (BdrWorker *) bdr_worker_shmem_alloc(BDR_WORKER_APPLY, NULL);
- Assert(shmworker->worker_type == BDR_WORKER_APPLY);
- worker = &shmworker->data.apply;
- worker->connection_config_idx = off;
- worker->replay_stop_lsn = InvalidXLogRecPtr;
- worker->forward_changesets = false;
-
- /*
- * Now search for the perdb worker belonging to this slot.
- */
- for (i = 0; i < bdr_max_workers; i++)
- {
- BdrPerdbWorker *perdb;
- BdrWorker *entry = &BdrWorkerCtl->slots[i];
-
- if (entry->worker_type != BDR_WORKER_PERDB)
- continue;
-
- perdb = &entry->data.perdb;
-
- if (strcmp(NameStr(perdb->dbname), cfg->dbname) != 0)
- continue;
-
- /*
- * Remember how many connections there are for this node. This
- * will, e.g., be used to determine the quorum for ddl locks and
- * sequencer votes.
- */
- perdb->nnodes++;
- found_perdb = true;
- worker->perdb_worker_off = i;
- break;
- }
-
- if (!found_perdb)
- elog(ERROR, "couldn't find perdb entry for apply worker");
-
- /*
- * If this is a postmaster restart, don't register the worker a second
- * time when the per-db worker starts up.
- */
- worker->bgw_is_registered = bdr_is_restart;
- }
-
- /*
- * Make sure that we don't register workers if the postmaster restarts and
- * clears shmem, by keeping a record that we've asked for registration once
- * already.
- */
- bdr_is_restart = true;
-
- /*
- * We might need to re-populate shared memory after a postmaster restart.
- * So we don't free the bdr_startup_context or its contents.
- */
-}
-
/*
* Allocate a block from the bdr_worker shm segment in BdrWorkerCtl, or ERROR
* ctl_idx, if passed, is set to the index of the worker within BdrWorkerCtl.
*
* To release a block, use bdr_worker_shmem_release(...)
+ *
+ * You must hold BdrWorkerCtl->lock in LW_EXCLUSIVE mode for
+ * this call.
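+ *
+ * A typical caller looks roughly like this (an illustrative sketch, not a
+ * specific call site; worker_idx is a hypothetical local variable):
+ *
+ *     LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+ *     worker = bdr_worker_shmem_alloc(BDR_WORKER_APPLY, &worker_idx);
+ *     worker->data.apply.dboid = MyDatabaseId;
+ *     LWLockRelease(BdrWorkerCtl->lock);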
*/
BdrWorker*
bdr_worker_shmem_alloc(BdrWorkerType worker_type, uint32 *ctl_idx)
{
int i;
- LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+ Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
for (i = 0; i < bdr_max_workers; i++)
{
BdrWorker *new_entry = &BdrWorkerCtl->slots[i];
{
memset(new_entry, 0, sizeof(BdrWorker));
new_entry->worker_type = worker_type;
- LWLockRelease(BdrWorkerCtl->lock);
if (ctl_idx)
*ctl_idx = i;
return new_entry;
}
}
- LWLockRelease(BdrWorkerCtl->lock);
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("No free bdr worker slots - bdr.max_workers is too low")));
void
_PG_init(void)
{
- List *connames;
- ListCell *c;
MemoryContext old_context;
- char *connections_tmp;
-
- char **used_databases;
- char **database_initcons;
- Size num_used_databases = 0;
- int connection_config_idx;
- BackgroundWorker bgw;
- uint32 off;
if (!process_shared_preload_libraries_in_progress)
ereport(ERROR,
errmsg("bdr requires \"track_commit_timestamp\" to be enabled")));
#endif
+ /*
+ * _PG_init only runs on first load, not on postmaster restart, so
+ * set the worker generation here. See bdr_worker_shmem_startup.
+ *
+ * It starts at 1 because the postmaster zeroes shmem on restart, so 0 can
+ * mean "just restarted, hasn't run shmem setup callback yet".
+ */
+ bdr_worker_generation = 1;
+
/*
* Force btree_gist to be loaded - its absolutely not required at this
* point, but since it's required for BDR to be used it's much easier to
* memory array.
*/
DefineCustomIntVariable("bdr.max_workers",
- "max number of bdr connections + distinct databases. -1 auto-calculates.",
+ "max number of bdr connections + distinct databases.",
NULL,
&bdr_max_workers,
- -1, -1, 100,
+ 20, 2, 100,
+ PGC_POSTMASTER,
+ 0,
+ NULL, NULL, NULL);
+
+ DefineCustomIntVariable("bdr.max_databases",
+ "max number of distinct databases on which BDR may be active",
+ NULL,
+ &bdr_max_databases,
+ -1, -1, 50,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
0,
NULL, NULL, NULL);
- DefineCustomBoolVariable("bdr.init_from_basedump",
- "Internal. Set during local initialization from basebackup only",
- NULL,
- &bdr_init_from_basedump,
- false,
- PGC_BACKEND,
- 0,
- NULL, NULL, NULL);
-
DefineCustomBoolVariable("bdr.do_not_replicate",
"Internal. Set during local initialization from basebackup only",
NULL,
bdr_label_init();
- /* if nothing is configured, we're done */
- if (connections == NULL)
- {
- /* If worker count autoconfigured, use zero */
- if (bdr_max_workers == -1)
- bdr_max_workers = 0;
- goto out;
- }
-
- /* Copy 'connections' guc so SplitIdentifierString can modify it in-place */
- connections_tmp = pstrdup(connections);
-
- /* Get the list of BDR connection names to iterate over. */
- if (!SplitIdentifierString(connections_tmp, ',', &connames))
- {
- /* syntax error in list */
- ereport(FATAL,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("invalid list syntax for \"bdr.connections\"")));
- }
-
- /*
- * If bdr.max_connections is -1, the default, auto-set it with the
- * most workers we might need with the current number of connections
- * configured. Per-db workers are due to use shmem too, so we might
- * have up to one per-db worker for each configured connection if
- * each is on a different DB.
- */
- if (bdr_max_workers == -1)
- {
- bdr_max_workers = list_length(connames) * 3;
- elog(DEBUG1, "bdr: bdr_max_workers unset, configuring for %d workers",
- bdr_max_workers);
- }
+ bdr_supervisor_register();
/*
* Sanity check max_worker_processes to make sure it's at least big enough
errhint("Set max_worker_processes to at least %d", bdr_max_workers)));
}
+	/*
+	 * If bdr.max_databases is not explicitly specified, assume the worst case
+	 * of one connection per database: each such database needs a per-db
+	 * worker plus an apply worker, so at most bdr_max_workers / 2 databases
+	 * (e.g. 10 with the default bdr.max_workers of 20).
+	 */
+ if (bdr_max_databases == -1)
+ {
+ bdr_max_databases = bdr_max_workers / 2;
+ elog(DEBUG1, "Autoconfiguring bdr.max_databases to %d (bdr.max_workers/2)",
+ bdr_max_databases);
+ }
+
/*
* Allocate a shared memory segment to store the bgworker connection
* information we must pass to each worker we launch.
*/
bdr_worker_alloc_shmem_segment();
- /* Allocate space for BDR connection GUCs */
- bdr_connection_configs = (BdrConnectionConfig**)
- palloc0(bdr_max_workers * sizeof(BdrConnectionConfig*));
-
- /* Names of all databases we're going to be doing BDR for */
- used_databases = palloc0(sizeof(char *) * list_length(connames));
- /*
- * For each db named in used_databases, the corresponding index is the name
- * of the conn with bdr_init_replica=t if any.
- */
- database_initcons = palloc0(sizeof(char *) * list_length(connames));
-
- /*
- * Read all connections, create/validate parameters for them and do sanity
- * checks as we go.
- */
- connection_config_idx = 0;
- foreach(c, connames)
- {
- char *name;
- name = (char *) lfirst(c);
-
- if (!bdr_create_con_gucs(name, used_databases, &num_used_databases,
- database_initcons,
- &bdr_connection_configs[connection_config_idx]))
- continue;
-
- Assert(bdr_connection_configs[connection_config_idx] != NULL);
- connection_config_idx++;
- }
-
- /*
- * Free the connames list cells. The strings are just pointers into
- * 'connections' and must not be freed'd.
- */
- list_free(connames);
- connames = NIL;
-
- /*
- * We've ensured there are no duplicate init connections, no need to
- * remember which conn is the bdr_init_replica conn anymore. The contents
- * are just pointers into connections_tmp so we don't want to free them.
- */
- pfree(database_initcons);
-
- /*
- * Copy the list of used databases into a global where we can
- * use it for registering the per-database workers during shmem init.
- */
- bdr_distinct_dbnames = palloc(sizeof(char*)*num_used_databases);
- memcpy(bdr_distinct_dbnames, used_databases,
- sizeof(char*)*num_used_databases);
- bdr_distinct_dbnames_count = num_used_databases;
- pfree(used_databases);
- num_used_databases = 0;
- used_databases = NULL;
-
- /*
- * Register the per-db workers and assign them an index in shmem. The
- * memory doesn't actually exist yet, it'll be allocated in shmem init.
- *
- * No protection against multiple launches is requried because this
- * only runs once, in _PG_init.
- */
- bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
- BGWORKER_BACKEND_DATABASE_CONNECTION;
- bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
- bgw.bgw_main = NULL;
- strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
- strncpy(bgw.bgw_function_name, "bdr_perdb_worker_main", BGW_MAXLEN);
- bgw.bgw_restart_time = 5;
- bgw.bgw_notify_pid = 0;
- for (off = 0; off < bdr_distinct_dbnames_count; off++)
- {
- snprintf(bgw.bgw_name, BGW_MAXLEN,
- "bdr: %s", bdr_distinct_dbnames[off]);
- /*
- * This index into BdrWorkerCtl shmem hasn't been populated yet. It'll
- * be set up in bdr_worker_shmem_create_workers .
- */
- bgw.bgw_main_arg = Int32GetDatum(off);
- RegisterBackgroundWorker(&bgw);
- }
-
EmitWarningsOnPlaceholders("bdr");
- pfree(connections_tmp);
-
-out:
-
/*
* initialize other modules that need shared memory
- *
- * Do so even if we haven't any remote nodes setup, the shared memory might
- * still be needed for some sql callable functions or such.
*/
/* register a slot for every remote node */
bdr_count_shmem_init(bdr_max_workers);
bdr_executor_init();
#ifdef BUILDING_BDR
- bdr_sequencer_shmem_init(bdr_max_workers, bdr_distinct_dbnames_count);
+ bdr_sequencer_shmem_init(bdr_max_workers, bdr_max_databases);
#endif
- bdr_locks_shmem_init(bdr_distinct_dbnames_count);
+ bdr_locks_shmem_init();
/* Set up a ProcessUtility_hook to stop unsupported commands being run */
init_bdr_commandfilter();
* Concurrent executions will block, but not fail.
*
 * Must be called inside a transaction.
+ *
+ * If update_extensions is true, ALTER EXTENSION commands will be issued to
+ * ensure the required extension(s) are at the current version.
*/
void
-bdr_maintain_schema(void)
+bdr_maintain_schema(bool update_extensions)
{
Relation extrel;
Oid btree_gist_oid;
btree_gist_oid = get_extension_oid("btree_gist", true);
bdr_oid = get_extension_oid("bdr", true);
- /* create required extension if they don't exists yet */
if (btree_gist_oid == InvalidOid)
- {
- CreateExtensionStmt create_stmt;
+ elog(ERROR, "btree_gist is required by BDR but not installed in the current database");
- create_stmt.if_not_exists = false;
- create_stmt.options = NIL;
- create_stmt.extname = (char *)"btree_gist";
- CreateExtension(&create_stmt);
- }
- else
+ if (bdr_oid == InvalidOid)
+ elog(ERROR, "bdr extension is not installed in the current database");
+
+ if (update_extensions)
{
AlterExtensionStmt alter_stmt;
alter_stmt.options = NIL;
alter_stmt.extname = (char *)"btree_gist";
ExecAlterExtensionStmt(&alter_stmt);
- }
-
- if (bdr_oid == InvalidOid)
- {
- CreateExtensionStmt create_stmt;
-
- create_stmt.if_not_exists = false;
- create_stmt.options = NIL;
- create_stmt.extname = (char *)"bdr";
- CreateExtension(&create_stmt);
- }
- else
- {
- AlterExtensionStmt alter_stmt;
/* TODO: only do this if necessary */
alter_stmt.options = NIL;
# bdr extension
comment = 'Bi-directional replication for PostgreSQL'
-default_version = '0.9.0.0'
+default_version = '0.9.0.1'
module_pathname = '$libdir/bdr'
relocatable = false
requires = btree_gist
#include "postmaster/bgworker.h"
#include "replication/logical.h"
#include "utils/resowner.h"
+#include "storage/latch.h"
#include "storage/lock.h"
#include "libpq-fe.h"
*/
typedef struct BdrApplyWorker
{
+ /* oid of the database this worker is applying changes to */
+ Oid dboid;
+
/*
- * Index in bdr_connection_configs of this workers's GUCs
- * and config info (including dbname, name, etc).
+ * Identification for the remote db we're connecting to; used to
+	 * find the appropriate bdr.bdr_connections row, etc.
*/
- int connection_config_idx;
+ uint64 remote_sysid;
+ TimeLineID remote_timeline;
+ Oid remote_dboid;
/*
* If not InvalidXLogRecPtr, stop replay at this point and exit.
/* Request that the remote forward all changes from other nodes */
bool forward_changesets;
-
- /*
- * Ensure this worker doesn't get registered a second time if there's a
- * perdb worker restart or postmaster restart. Ideally we'd store the
- * BackgroundWorkerHandle, but it's an opaque struct.
- */
- bool bgw_is_registered;
-
- size_t perdb_worker_off;
} BdrApplyWorker;
/*
*/
typedef struct BdrPerdbWorker
{
- /* local database name */
+ /* local database name to connect to */
NameData dbname;
/* number of outgoing connections from this database */
- size_t nnodes;
+ Size nnodes;
size_t seq_slot;
+ /* The perdb worker's latch from the PROC array, for use from other backends */
+ Latch *proclatch;
+
+ /* Oid of the database the worker is attached to - populated after start */
+ Oid database_oid;
} BdrPerdbWorker;
/*
* Type of BDR worker in a BdrWorker struct
+ *
+ * Note that the supervisor worker doesn't appear here, it has its own
+ * dedicated entry in the shmem segment.
*/
typedef enum {
/*
* it's set by memset(...) during shm segment init.
*/
BDR_WORKER_EMPTY_SLOT = 0,
- /* This shm array slot contains data for a */
+ /* This shm array slot contains data for a BdrApplyWorker */
BDR_WORKER_APPLY,
/* This is data for a per-database worker BdrPerdbWorker */
BDR_WORKER_PERDB,
} BdrWorker;
-/*
- * Params for every connection in bdr.connections.
- *
- * Contains n=bdr_max_workers elements, may have NULL entries.
- */
-extern BdrConnectionConfig **bdr_connection_configs;
-
/* GUCs */
extern int bdr_default_apply_delay;
extern int bdr_max_workers;
+extern int bdr_max_databases;
extern char *bdr_temp_dump_directory;
-extern bool bdr_init_from_basedump;
extern bool bdr_log_conflicts_to_table;
extern bool bdr_conflict_logging_include_tuples;
extern bool bdr_permit_unsafe_commands;
{
/* Must hold this lock when writing to BdrWorkerControl members */
LWLockId lock;
+ /* Worker generation number, incremented on postmaster restart */
+ uint16 worker_generation;
/* Set/unset by bdr_apply_pause()/_replay(). */
bool pause_apply;
+ /* Is this the first startup of the supervisor? */
+ bool is_supervisor_restart;
+ /* Latch for the supervisor worker */
+ Latch *supervisor_latch;
/* Array members, of size bdr_max_workers */
BdrWorker slots[FLEXIBLE_ARRAY_MEMBER];
} BdrWorkerControl;
extern BdrWorkerControl *BdrWorkerCtl;
+extern BdrWorker *bdr_worker_slot;
extern ResourceOwner bdr_saved_resowner;
extern Oid BdrReplicationSetConfigRelid;
+/* Structure representing a bdr.bdr_nodes record */
+typedef struct BDRNodeInfo
+{
+ /* ID */
+ uint64 sysid;
+ TimeLineID timeline;
+ Oid dboid;
+
+ char status;
+
+ char *local_dsn;
+ char *init_from_dsn;
+} BDRNodeInfo;
+
extern Oid bdr_lookup_relid(const char *relname, Oid schema_oid);
+extern void bdr_sequencer_set_nnodes(Size nnodes);
+
+
/* apply support */
extern void bdr_fetch_sysid_via_node_id(RepNodeId node_id, uint64 *sysid,
TimeLineID *tli, Oid *remote_dboid);
PGDLLEXPORT extern Datum bdr_sequence_options(PG_FUNCTION_ARGS);
#endif
+extern int bdr_sequencer_get_next_free_slot(void); //XXX PERDB temp
+
+
/* statistic functions */
-extern void bdr_count_shmem_init(size_t nnodes);
+extern void bdr_count_shmem_init(Size nnodes);
extern void bdr_count_set_current_node(RepNodeId node_id);
extern void bdr_count_commit(void);
extern void bdr_count_rollback(void);
extern bool bdr_get_bigendian(void);
/* initialize a new bdr member */
-extern void bdr_init_replica(Name dbname);
+extern void bdr_init_replica(BDRNodeInfo *local_node);
/* shared memory management */
-extern void bdr_maintain_schema(void);
+extern void bdr_maintain_schema(bool update_extensions);
extern BdrWorker* bdr_worker_shmem_alloc(BdrWorkerType worker_type,
uint32 *ctl_idx);
extern void bdr_worker_shmem_release(BdrWorker* worker, BackgroundWorkerHandle *handle);
extern void bdr_queue_ddl_command(char *command_tag, char *command);
extern void bdr_execute_ddl_command(char *cmdstr, char *perpetrator, bool tx_just_started);
-extern void bdr_locks_shmem_init(Size num_used_databases);
+extern void bdr_locks_shmem_init(void);
extern void bdr_locks_check_query(void);
-/* background workers */
-extern void bdr_worker_init(char* dbname);
+/* background workers and supporting functions for them */
PGDLLEXPORT extern void bdr_apply_main(Datum main_arg);
PGDLLEXPORT extern void bdr_perdb_worker_main(Datum main_arg);
+PGDLLEXPORT extern void bdr_supervisor_worker_main(Datum main_arg);
+
+extern void bdr_worker_init(char* dbname);
+extern void bdr_supervisor_register(void);
+
+extern void bdr_sighup(SIGNAL_ARGS);
+extern void bdr_sigterm(SIGNAL_ARGS);
+
+extern int find_perdb_worker_slot(Oid dboid,
+ BdrWorker **worker_found);
+
+extern void bdr_launch_apply_workers(Oid dboid);
/* Information functions */
extern int bdr_parse_version(const char * bdr_version_str, int *o_major,
int *o_minor, int *o_rev, int *o_subrev);
/* manipulation of bdr catalogs */
-extern char bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli, Oid dboid);
+extern char bdr_nodes_get_local_status(uint64 sysid, TimeLineID tli,
+ Oid dboid);
+extern BDRNodeInfo * bdr_nodes_get_local_info(uint64 sysid, TimeLineID tli,
+ Oid dboid);
+extern void bdr_bdr_node_free(BDRNodeInfo *node);
extern void bdr_nodes_set_local_status(char status);
extern Oid GetSysCacheOidError(int cacheId, Datum key1, Datum key2, Datum key3,
/* helpers shared by multiple worker types */
extern struct pg_conn* bdr_connect(const char *conninfo, Name appname,
- uint64* remote_sysid_i, TimeLineID *remote_tlid_i,
+ uint64* remote_sysid_i,
+ TimeLineID *remote_tlid_i,
Oid *out_dboid_i);
extern struct pg_conn *
TimeLineID *out_timeline,
Oid *out_dboid,
RepNodeId *out_replication_identifier,
- char **out_snapshot);
-extern void
-bdr_build_ident_and_slotname(uint64 remote_sysid, TimeLineID remote_tlid,
- Oid remote_dboid, char **out_replication_identifier,
- Name out_slot_name);
+ char **out_snapshot);
extern PGconn* bdr_connect_nonrepl(const char *connstring,
const char *appnamesuffix);
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
+#include "executor/spi.h"
+
#include "libpq/pqformat.h"
#include "mb/pg_wchar.h"
/*
* This code only runs within an apply bgworker, so we can stash a pointer to our
* state in shm in a global for convenient access.
- *
- * TODO: make static once bdr_apply_main moved into bdr.c
*/
-BdrApplyWorker *bdr_apply_worker = NULL;
+static BdrApplyWorker *bdr_apply_worker = NULL;
-/*
- * GUCs for this apply worker - again, this is fixed for the lifetime of the
- * worker so we can stash it in a global.
- */
-BdrConnectionConfig *bdr_apply_config = NULL;
+static BdrConnectionConfig *bdr_apply_config = NULL;
dlist_head bdr_lsn_association = DLIST_STATIC_INIT(bdr_lsn_association);
replication_origin_xid = remote_xid;
snprintf(statbuf, sizeof(statbuf),
- "bdr_apply: BEGIN origin(source, orig_lsn, timestamp): %s, %X/%X, %s",
- bdr_apply_config->name,
+				"bdr_apply: BEGIN origin(orig_lsn, timestamp): %X/%X, %s",
(uint32) (origlsn >> 32), (uint32) origlsn,
timestamptz_to_str(committime));
&& bdr_apply_worker->replay_stop_lsn <= end_lsn)
{
ereport(LOG,
- (errmsg("bdr apply %s finished processing; replayed to %X/%X of required %X/%X",
- bdr_apply_config->name,
+ (errmsg("bdr apply finished processing; replayed to %X/%X of required %X/%X",
(uint32)(end_lsn>>32), (uint32)end_lsn,
(uint32)(bdr_apply_worker->replay_stop_lsn>>32), (uint32)bdr_apply_worker->replay_stop_lsn)));
/*
}
}
+
/*
* Entry point for a BDR apply worker.
*
RepNodeId replication_identifier;
XLogRecPtr start_from;
NameData slot_name;
- BdrWorker *bdr_worker_slot;
+ NameData dbname;
+ BdrWorker *perdb;
+ uint32 worker_arg;
+ uint16 apply_worker_idx,
+ worker_generation;
+ int perdb_worker_idx;
Assert(IsBackgroundWorker);
+ worker_arg = DatumGetInt32(main_arg);
+
+ worker_generation = (uint16)(worker_arg >> 16);
+ apply_worker_idx = (uint16)(worker_arg & 0x0000FFFF);
+
+ if (worker_generation != BdrWorkerCtl->worker_generation)
+ {
+ elog(DEBUG1, "apply worker from generation %d exiting after finding shmem generation is %d",
+ worker_generation, BdrWorkerCtl->worker_generation);
+ proc_exit(0);
+ }
+
initStringInfo(&query);
- bdr_worker_slot = &BdrWorkerCtl->slots[ DatumGetInt32(main_arg) ];
+ bdr_worker_slot = &BdrWorkerCtl->slots[ apply_worker_idx ];
Assert(bdr_worker_slot->worker_type == BDR_WORKER_APPLY);
bdr_apply_worker = &bdr_worker_slot->data.apply;
bdr_worker_type = BDR_WORKER_APPLY;
- bdr_apply_config = bdr_connection_configs[bdr_apply_worker->connection_config_idx];
- Assert(bdr_apply_config != NULL);
-
- bdr_worker_init(bdr_apply_config->dbname);
+ /*
+	 * Get the database name to connect to from the perdb worker for this db.
+	 *
+	 * It'd be preferable to just connect by oid, but the bgworker interface
+	 * doesn't permit us to do that, and we can't look up the syscache to find
+	 * the name by oid until we're connected.
+ */
+ LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
+ perdb_worker_idx = find_perdb_worker_slot(bdr_apply_worker->dboid, NULL);
+ Assert(perdb_worker_idx >= 0);
+ perdb = &BdrWorkerCtl->slots[perdb_worker_idx];
+ Assert(perdb->worker_type == BDR_WORKER_PERDB);
+ namecpy(&dbname, &perdb->data.perdb.dbname);
+ LWLockRelease(BdrWorkerCtl->lock);
+
+ /* Then unblock signals, connect to the db, etc */
+ bdr_worker_init(NameStr(dbname));
+
+ Assert(MyDatabaseId == bdr_apply_worker->dboid);
+
+ /* Read our connection configuration from the database */
+ bdr_apply_config = bdr_get_connection_config(
+ bdr_apply_worker->remote_sysid,
+ bdr_apply_worker->remote_timeline,
+ bdr_apply_worker->remote_dboid,
+ false);
+
+ Assert(bdr_apply_config->sysid == bdr_apply_worker->remote_sysid &&
+ bdr_apply_config->timeline == bdr_apply_worker->remote_timeline &&
+ bdr_apply_config->dboid == bdr_apply_worker->remote_dboid);
CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr apply top-level resource owner");
bdr_saved_resowner = CurrentResourceOwner;
elog(DEBUG1, "%s initialized on %s",
- MyBgworkerEntry->bgw_name, bdr_apply_config->dbname);
+ MyBgworkerEntry->bgw_name, NameStr(dbname));
/* Set our local application_name for our SPI connections */
resetStringInfo(&query);
appendStringInfo(&query, ", db_encoding '%s'", GetDatabaseEncodingName());
if (bdr_apply_worker->forward_changesets)
appendStringInfo(&query, ", forward_changesets 't'");
+ if (bdr_apply_config->is_unidirectional)
+ appendStringInfo(&query, ", unidirectional 't'");
appendStringInfoChar(&query, ')');
#include "utils/builtins.h"
#include "utils/guc.h"
+#include "utils/memutils.h"
#include "utils/syscache.h"
+static int getattno(const char *colname);
+static char* bdr_textarr_to_identliststr(ArrayType *textarray);
+
+
/* GetSysCacheOid equivalent that errors out if nothing is found */
Oid
GetSysCacheOidError(int cacheId,
Oid argtypes[] = { TEXTOID, OIDOID, OIDOID };
Datum values[3];
bool isnull;
- char status;
+ char status;
char sysid_str[33];
Oid schema_oid;
}
/*
- * Insert a row for the local node's (sysid,tlid,dboid) with the passed status
- * into bdr.bdr_nodes. No existing row for this key may exist.
+ * Get the bdr.bdr_nodes record for the specified node from the local
+ * bdr.bdr_nodes table via SPI.
*
- * Unlike bdr_set_remote_status, '\0' may not be passed to delete the row, and
- * no upsert is performed. This is a simple insert only.
+ * Returns a palloc'd BDRNodeInfo for the matching row, or NULL if no such
+ * row exists.
*
- * Unlike bdr_nodes_get_local_status, only the status of the local node may
- * be set.
+ * SPI must be initialized, and you must be in a running transaction.
+ */
+BDRNodeInfo *
+bdr_nodes_get_local_info(uint64 sysid, TimeLineID tli, Oid dboid)
+{
+ int spi_ret;
+ Oid argtypes[] = { TEXTOID, OIDOID, OIDOID };
+ Datum values[3];
+ bool isnull;
+ BDRNodeInfo *node;
+ char sysid_str[33];
+ Oid schema_oid;
+ MemoryContext caller_ctx;
+ MemoryContext saved_ctx PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(IsTransactionState());
+
+ /* Save the calling memory context, which we'll allocate results in */
+ caller_ctx = MemoryContextSwitchTo(CurTransactionContext);
+
+ Assert(MemoryContextIsValid(caller_ctx));
+
+ snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, sysid);
+ sysid_str[sizeof(sysid_str)-1] = '\0';
+
+ /*
+ * Determine if BDR is present on this DB. The output plugin can
+ * be started on a db that doesn't actually have BDR active, but
+ * we don't want to allow that.
+ *
+ * Check for a bdr schema.
+ */
+ schema_oid = GetSysCacheOid1(NAMESPACENAME, CStringGetDatum("bdr"));
+ if (schema_oid == InvalidOid)
+ ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("No bdr schema is present in database %s, cannot create a bdr slot",
+ get_database_name(MyDatabaseId)),
+ errhint("There is no bdr.connections entry for this database on the target node or bdr is not in shared_preload_libraries")));
+
+ values[0] = CStringGetTextDatum(sysid_str);
+ values[1] = ObjectIdGetDatum(tli);
+ values[2] = ObjectIdGetDatum(dboid);
+
+ spi_ret = SPI_execute_with_args(
+ "SELECT node_status, node_local_dsn, node_init_from_dsn"
+ " FROM bdr.bdr_nodes"
+ " WHERE node_sysid = $1 AND node_timeline = $2 AND node_dboid = $3",
+ 3, argtypes, values, NULL, false, 1);
+
+	if (spi_ret != SPI_OK_SELECT)
+		elog(ERROR, "Unable to query bdr.bdr_nodes, SPI error %d", spi_ret);
+
+	if (SPI_processed == 0)
+	{
+		/* Switch back to the caller's memory context before returning */
+		MemoryContextSwitchTo(caller_ctx);
+		return NULL;
+	}
+
+ /* Switch to calling memory context to copy results */
+ saved_ctx = MemoryContextSwitchTo(caller_ctx);
+ Assert(MemoryContextIsValid(saved_ctx));
+
+ node = palloc(sizeof(BDRNodeInfo));
+ node->sysid = sysid;
+ node->timeline = tli;
+ node->dboid = dboid;
+	node->status = DatumGetChar(SPI_getbinval(SPI_tuptable->vals[0],
+											   SPI_tuptable->tupdesc, 1,
+											   &isnull));
+	if (isnull)
+		elog(ERROR, "bdr.bdr_nodes.node_status is NULL; shouldn't happen");
+
+	node->local_dsn = SPI_getvalue(SPI_tuptable->vals[0],
+								   SPI_tuptable->tupdesc, 2);
+	node->init_from_dsn = SPI_getvalue(SPI_tuptable->vals[0],
+									   SPI_tuptable->tupdesc, 3);
+
+ return node;
+}
+
+/* Free the BDRNodeInfo pointer including its properties. */
+void
+bdr_bdr_node_free(BDRNodeInfo *node)
+{
+ if (node == NULL)
+ return;
+
+ if (node->local_dsn)
+ pfree(node->local_dsn);
+ if (node->init_from_dsn)
+ pfree(node->init_from_dsn);
+ pfree(node);
+}
+
+/*
+ * Update the status field on the local node (as identified by current
+ * sysid,tlid,dboid) of bdr.bdr_nodes. The node record must already exist.
*
- * SPI must be initialized, and you must be in a running transaction that is
- * not bound to any remote node replication state.
+ * Unlike bdr_nodes_get_local_status, this interface does not accept
+ * sysid, tlid and dboid input but can only set the status of the local node.
*/
void
bdr_nodes_set_local_status(char status)
Oid argtypes[] = { CHAROID, TEXTOID, OIDOID, OIDOID };
Datum values[4];
char sysid_str[33];
+ bool tx_started = false;
+ bool spi_pushed;
- Assert(status != '\0'); /* Cannot pass \0 to delete */
- Assert(IsTransactionState());
+ Assert(status != '\0'); /* Cannot pass \0 */
/* Cannot have replication apply state set in this tx */
Assert(replication_origin_id == InvalidRepNodeId);
+ if (!IsTransactionState())
+ {
+ tx_started = true;
+ StartTransactionCommand();
+ }
+ spi_pushed = SPI_push_conditional();
+ SPI_connect();
+
snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
GetSystemIdentifier());
sysid_str[sizeof(sysid_str)-1] = '\0';
values[3] = ObjectIdGetDatum(MyDatabaseId);
spi_ret = SPI_execute_with_args(
- "INSERT INTO bdr.bdr_nodes"
- " (node_status, node_sysid, node_timeline, node_dboid)"
- " VALUES ($1, $2, $3, $4);",
+ "UPDATE bdr.bdr_nodes"
+ " SET node_status = $1"
+ " WHERE node_sysid = $2"
+ " AND node_timeline = $3"
+ " AND node_dboid = $4;",
4, argtypes, values, NULL, false, 0);
- if (spi_ret != SPI_OK_INSERT)
- elog(ERROR, "Unable to insert row (status=%c, node_sysid="
+ if (spi_ret != SPI_OK_UPDATE)
+ elog(ERROR, "Unable to set status=%c of row (node_sysid="
UINT64_FORMAT ", node_timeline=%u, node_dboid=%u) "
- "into bdr.bdr_nodes: SPI error %d",
+ "in bdr.bdr_nodes: SPI error %d",
status, GetSystemIdentifier(), ThisTimeLineID,
MyDatabaseId, spi_ret);
+
+ SPI_finish();
+ SPI_pop_conditional(spi_pushed);
+ if (tx_started)
+ CommitTransactionCommand();
}
/*
return GetReplicationIdentifier(ident, false);
}
+/*
+ * Read connection configuration data from the DB and return zero or more
+ * matching palloc'd BdrConnectionConfig results in a list.
+ *
+ * A transaction must be open.
+ *
+ * The list and values are allocated in the calling memory context. By default
+ * this is the transaction memory context, but you can switch to another
+ * context before calling.
+ *
+ * Each BdrConnectionConfig's char* fields are palloc'd values.
+ *
+ * Uses the SPI, so push/pop caller's SPI state if needed.
+ *
+ * May raise exceptions from queries, SPI errors, etc.
+ *
+ * If both an entry with conn_origin for this node and one with null
+ * conn_origin are found, only the one specific to this node is returned,
+ * as it takes precedence over any generic configuration entry.
+ */
+List*
+bdr_read_connection_configs()
+{
+ HeapTuple tuple;
+ StringInfoData query;
+ int i;
+ int ret;
+ List *configs = NIL;
+ MemoryContext caller_ctx, saved_ctx;
+ char sysid_str[33];
+ Datum values[3];
+ Oid types[3] = { TEXTOID, OIDOID, OIDOID };
+
+ Assert(IsTransactionState());
+
+ /* Save the calling memory context, which we'll allocate results in */
+ caller_ctx = MemoryContextSwitchTo(CurTransactionContext);
+
+ initStringInfo(&query);
+
+ /*
+ * Find a connections row specific to this origin node or if none
+ * exists, the default connection data for that node.
+ *
+ * Configurations for all nodes, including the local node, are read.
+ */
+ appendStringInfo(&query, "SELECT DISTINCT ON (conn_sysid, conn_timeline, conn_dboid) "
+ " conn_sysid, conn_timeline, conn_dboid, "
+ " conn_dsn, conn_apply_delay, "
+ " conn_replication_sets, "
+ " conn_is_unidirectional, "
+ " conn_origin_dboid <> 0 AS origin_is_my_id "
+ "FROM bdr.bdr_connections "
+ "WHERE (conn_origin_sysid = '0' "
+ " AND conn_origin_timeline = 0 "
+ " AND conn_origin_dboid = 0) "
+ " OR (conn_origin_sysid = $1 "
+ " AND conn_origin_timeline = $2 "
+ " AND conn_origin_dboid = $3) "
+ "ORDER BY conn_sysid, conn_timeline, conn_dboid, "
+ " conn_origin_sysid ASC NULLS LAST, "
+ " conn_timeline ASC NULLS LAST, "
+ " conn_dboid ASC NULLS LAST "
+ );
+
+ snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, GetSystemIdentifier());
+ sysid_str[sizeof(sysid_str)-1] = '\0';
+
+ values[0] = CStringGetTextDatum(&sysid_str[0]);
+ values[1] = ObjectIdGetDatum(ThisTimeLineID);
+ values[2] = ObjectIdGetDatum(MyDatabaseId);
+
+ SPI_connect();
+
+ ret = SPI_execute_with_args(query.data, 3, types, values, NULL, false, 0);
+
+ if (ret != SPI_OK_SELECT)
+ elog(ERROR, "SPI error while querying bdr.bdr_connections");
+
+ /* Switch to calling memory context to copy results */
+ saved_ctx = MemoryContextSwitchTo(caller_ctx);
+
+ for (i = 0; i < SPI_processed; i++)
+ {
+ Datum tmp_datum;
+ bool isnull;
+ ArrayType *conn_replication_sets;
+ char *tmp_sysid;
+
+ BdrConnectionConfig *cfg = palloc(sizeof(BdrConnectionConfig));
+
+ tuple = SPI_tuptable->vals[i];
+
+ /*
+ * Fetch tuple attributes
+ *
+ * Note: SPI_getvalue calls the output function for the type, so the
+ * string is allocated in our memory context and doesn't need copying.
+ */
+ tmp_sysid = SPI_getvalue(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_sysid"));
+
+ if (sscanf(tmp_sysid, UINT64_FORMAT, &cfg->sysid) != 1)
+ elog(ERROR, "Parsing sysid uint64 from %s failed", tmp_sysid);
+
+ tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_timeline"),
+ &isnull);
+ Assert(!isnull);
+ cfg->timeline = DatumGetObjectId(tmp_datum);
+
+ tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_dboid"),
+ &isnull);
+ Assert(!isnull);
+ cfg->dboid = DatumGetObjectId(tmp_datum);
+
+ tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_is_unidirectional"),
+ &isnull);
+ Assert(!isnull);
+ cfg->is_unidirectional = DatumGetBool(tmp_datum);
+
+ tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("origin_is_my_id"),
+ &isnull);
+ Assert(!isnull);
+ cfg->origin_is_my_id = DatumGetBool(tmp_datum);
+
+
+ cfg->dsn = SPI_getvalue(tuple,
+ SPI_tuptable->tupdesc,
+ getattno("conn_dsn"));
+
+ tmp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_apply_delay"), &isnull);
+ if (isnull)
+ cfg->apply_delay = -1;
+ else
+ cfg->apply_delay = DatumGetInt32(tmp_datum);
+
+ /*
+ * Replication sets are stored in the catalogs as a text[]
+ * of identifiers, so we'll want to unpack that.
+ */
+
+ conn_replication_sets = (ArrayType*)
+ SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_replication_sets"), &isnull);
+
+ if (isnull)
+ cfg->replication_sets = NULL;
+ else
+ {
+ cfg->replication_sets =
+ bdr_textarr_to_identliststr(DatumGetArrayTypeP(conn_replication_sets));
+ }
+
+ configs = lcons(cfg, configs);
+
+ }
+
+ MemoryContextSwitchTo(saved_ctx);
+
+ SPI_finish();
+
+ MemoryContextSwitchTo(caller_ctx);
+
+ return configs;
+}
+
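+/*
+ * Free the palloc'd string fields of a BdrConnectionConfig. The struct
+ * itself is not freed here.
+ */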
+void
+bdr_free_connection_config(BdrConnectionConfig *cfg)
+{
+ if (cfg->dsn != NULL)
+ pfree(cfg->dsn);
+ if (cfg->replication_sets != NULL)
+ pfree(cfg->replication_sets);
+}
+
+/*
+ * Fetch the connection configuration for the specified node, i.e. the
+ * bdr.bdr_connections entry with the given (conn_sysid, conn_timeline,
+ * conn_dboid). Errors out if no entry is found, unless missing_ok is true,
+ * in which case NULL is returned.
+ */
+BdrConnectionConfig*
+bdr_get_connection_config(uint64 sysid, TimeLineID timeline, Oid dboid,
+ bool missing_ok)
+{
+ List *configs;
+ ListCell *lc;
+ MemoryContext saved_ctx;
+ BdrConnectionConfig *found_config = NULL;
+ bool tx_started = false;
+
+ Assert(MyDatabaseId != InvalidOid);
+
+ if (!IsTransactionState())
+ {
+ tx_started = true;
+ StartTransactionCommand();
+ }
+
+ saved_ctx = MemoryContextSwitchTo(TopMemoryContext);
+ configs = bdr_read_connection_configs();
+ MemoryContextSwitchTo(saved_ctx);
+
+ /*
+ * TODO DYNCONF Instead of reading all configs and then discarding all but
+ * the interesting one, we should really be doing a different query that
+ * returns only the configuration of interest. As this runs only during apply
+ * worker startup the impact is negligible.
+ */
+ foreach(lc, configs)
+ {
+ BdrConnectionConfig *cfg = (BdrConnectionConfig*) lfirst(lc);
+
+ if (cfg->sysid == sysid
+ && cfg->timeline == timeline
+ && cfg->dboid == dboid)
+ {
+ found_config = cfg;
+ break;
+ }
+ else
+ {
+ bdr_free_connection_config(cfg);
+ }
+ }
+
+ if (found_config == NULL && !missing_ok)
+		elog(ERROR, "Failed to find expected row "
+			 "(conn_sysid,conn_timeline,conn_dboid) = "
+			 "("UINT64_FORMAT",%u,%u) "
+			 "in bdr.bdr_connections",
+ sysid, timeline, dboid);
+
+ if (tx_started)
+ CommitTransactionCommand();
+
+ list_free(configs);
+
+ return found_config;
+}
+
+
+static int
+getattno(const char *colname)
+{
+ int attno;
+
+ attno = SPI_fnumber(SPI_tuptable->tupdesc, colname);
+ if (attno == SPI_ERROR_NOATTRIBUTE)
+ elog(ERROR, "SPI error while reading %s from bdr.bdr_connections", colname);
+
+ return attno;
+}
+
+/*
+ * Given a text[] Datum guaranteed to contain no nulls, return an
+ * identifier-quoted comma-separated string allocated in the current memory
+ * context.
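+ *
+ * For example, the array {'setA','set_b'} comes out as the string
+ *   "setA",set_b
+ * since quote_identifier() only quotes names that need quoting.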
+ */
+static char*
+bdr_textarr_to_identliststr(ArrayType *textarray)
+{
+ Datum *elems;
+ int nelems, i;
+ StringInfoData si;
+
+ deconstruct_array(textarray,
+ TEXTOID, -1, false, 'i',
+ &elems, NULL, &nelems);
+
+ if (nelems == 0)
+ return pstrdup("");
+
+ initStringInfo(&si);
+
+ appendStringInfoString(&si,
+ quote_identifier(TextDatumGetCString(elems[0])));
+ for (i = 1; i < nelems; i++)
+ {
+ appendStringInfoString(&si, ",");
+ appendStringInfoString(&si,
+ quote_identifier(TextDatumGetCString(elems[i])));
+ }
+
+ /*
+ * The stringinfo is on the stack, but its data element is palloc'd
+ * in the caller's context and can be returned safely.
+ */
+ return si.data;
+
+}
+
/*
* Helper to format node identity info into buffers, which must already be
* allocated and big enough to hold a unit64 + terminator (33 bytes).
--- /dev/null
+/*
+ * bdr_common.c
+ *
+ * BiDirectionalReplication
+ *
+ * Utility functions that can be shared between the extension and the
+ * command-line tools (they don't require server-side libraries).
+ *
+ * Copyright (c) 2015, PostgreSQL Global Development Group
+ *
+ * bdr_common.c
+ */
+
+
+#include "postgres.h"
+
+#include "access/xlogdefs.h"
+#include "nodes/pg_list.h"
+
+#include "bdr_internal.h"
+
+
+/*
+ * Format slot name string from node identifiers.
+ */
+void
+bdr_slot_name(Name slot_name, uint64 sysid, TimeLineID tlid,
+ Oid dboid, Oid local_dboid)
+{
+ char sysid_str[33];
+
+ snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, sysid);
+ sysid_str[sizeof(sysid_str)-1] = '\0';
+
+ snprintf(NameStr(*slot_name), NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
+ local_dboid, sysid_str, tlid, dboid,
+ EMPTY_REPLICATION_NAME);
+ NameStr(*slot_name)[NAMEDATALEN-1] = '\0';
+}
static Oid BdrConflictResolutionOid = InvalidOid;
static Oid BdrConflictHistorySeqId = InvalidOid;
-/*
- * All this code runs only in the context of an apply worker, so
- * we can access the apply worker state global safely
- */
-extern BdrApplyWorker *bdr_apply_worker;
-
#define BDR_CONFLICT_HISTORY_COLS 30
#define SYSID_DIGITS 33
bool performs_writes = false;
ListCell *l;
- if (bdr_always_allow_writes || !bdr_is_bdr_activated_db(MyDatabaseId))
+ if (bdr_always_allow_writes)
goto done;
/* identify whether this is a modifying statement */
if (!performs_writes)
goto done;
+ if (!bdr_is_bdr_activated_db(MyDatabaseId))
+ goto done;
+
#ifdef BUILDING_BDR
bdr_locks_check_query();
#endif
#include "postgres_fe.h"
+#include "getopt_long.h"
+
#include "port.h"
#include "libpq-fe.h"
typedef struct RemoteInfo {
uint64 sysid;
TimeLineID tlid;
- Oid dboid;
+ int numdbs;
+ Oid *dboids;
+ char **dbnames;
} RemoteInfo;
-static char *argv0 = NULL;
-static const char *progname;
-static uint64 system_identifier;
-static NameData restore_point_name;
-static char *data_dir = NULL;
-static char *config_options = "";
-static char pid_file[MAXPGPATH];
-static time_t start_time;
+typedef struct NodeInfo {
+ uint64 remote_sysid;
+ TimeLineID remote_tlid;
+ uint64 local_sysid;
+ TimeLineID local_tlid;
+} NodeInfo;
+
+typedef enum {
+ VERBOSITY_NORMAL,
+ VERBOSITY_VERBOSE,
+ VERBOSITY_DEBUG
+} VerbosityLevelEnum;
+
+static char *argv0 = NULL;
+static const char *progname;
+static char *data_dir = NULL;
+static char pid_file[MAXPGPATH];
+static time_t start_time;
+static VerbosityLevelEnum verbosity = VERBOSITY_NORMAL;
/* defined as static so that die() can close them */
static PGconn *local_conn = NULL;
static PGconn *remote_conn = NULL;
-BdrConnectionConfig **bdr_connection_configs;
-size_t bdr_connection_config_count;
-
static void signal_handler(int sig);
static void usage(void);
static void die(const char *fmt,...)
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
-static void print_msg(const char *fmt,...)
-__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
+static void print_msg(VerbosityLevelEnum level, const char *fmt,...)
+__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));
-static int run_pg_ctl(const char *arg, const char *opts);
-static char *get_postgres_guc_value(char *guc, char *defval);
-static bool wait_postmaster_connection(void);
-static void wait_postgres_shutdown(void);
+static int run_pg_ctl(const char *arg);
+static void run_basebackup(const char *remote_connstr, const char *data_dir);
+static void wait_postmaster_connection(const char *connstr);
+static void wait_postmaster_shutdown(void);
-#ifdef BUILDING_UDR
-static void initialize_bdr(PGconn *conn);
-#endif
-static void remove_unwanted_state(void);
-static void initialize_replication_identifiers(char *remote_lsn);
-static void create_replication_identifier(PGconn *conn,
- const char *remote_ident, char *remote_lsn);
-static char *create_restore_point(char *remote_connstr);
-static void initialize_replication_slots(bool init_replica);
-static void create_replication_slot(PGconn *conn, Name slot_name);
-static RemoteInfo *get_remote_info(PGconn *conn, char* aux_connstr);
-static Oid get_dboid_from_dbname(PGconn *conn, const char* dbname);
+static void validate_remote_node(PGconn *conn);
+static void initialize_node_entry(PGconn *conn, NodeInfo *ni, Oid dboid,
+ char *remote_connstr);
+static void remove_unwanted_files(void);
+static void remove_unwanted_data(PGconn *conn, char *dbname);
+static void initialize_replication_identifier(PGconn *conn, NodeInfo *ni, Oid dboid, char *remote_lsn);
+static char *create_restore_point(PGconn *conn, char *restore_point_name);
+static void initialize_replication_slot(PGconn *conn, NodeInfo *ni, Oid dboid);
+static void bdr_node_start(PGconn *conn, char *remote_connstr, char *local_connstr);
+
+static RemoteInfo *get_remote_info(char* connstr);
+
+static void initialize_data_dir(char *data_dir, char *connstr,
+ char *postgresql_conf, char *pg_hba_conf);
static uint64 GenerateSystemIdentifier(void);
-static int set_sysid(void);
+static int set_sysid(uint64 sysid);
-static void read_bdr_config(void);
static void WriteRecoveryConf(PQExpBuffer contents);
+static void CopyConfFile(char *fromfile, char *tofile);
-static char *detect_local_conninfo(void);
-static char *detect_remote_conninfo(void);
-char *get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser);
-static char *PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values);
-static char *escapeConninfoValue(const char *val);
+char *get_connstr(char *dbname, char *dbhost, char *dbport, char *dbuser);
+static char *PQconninfoParamsToConnstr(const char *const * keywords, const char *const * values);
+static void appendPQExpBufferConnstrValue(PQExpBuffer buf, const char *str);
-static bool parse_bool(const char *value, bool *result);
-static bool parse_bool_with_len(const char *value, size_t len, bool *result);
-static char *trimwhitespace(const char *str);
-static char **split_list_guc(char *str, size_t *count);
-
-static bool is_pg_dir(char *path);
+static bool file_exists(const char *path);
+static bool is_pg_dir(const char *path);
+static void copy_file(char *fromfile, char *tofile);
static char *find_other_exec_or_die(const char *argv0, const char *target, const char *versionstr);
static bool postmaster_is_alive(pid_t pid);
static long get_pgpid(void);
-static char **readfile(const char *path);
-static void free_readfile(char **optlines);
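+/*
+ * Open a libpq connection. If dbname is given it is appended to connstr as
+ * a dbname= keyword (escaped via appendPQExpBufferConnstrValue), so e.g.
+ * ("host=node1 port=5432", "bdrdemo") connects with a connection string
+ * equivalent to "host=node1 port=5432 dbname=bdrdemo"; the host and dbname
+ * here are made-up examples. Dies with an error if the connection fails.
+ */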
+static PGconn *
+connectdb(char *connstr, const char *dbname)
+{
+ PGconn *conn;
+ char *connstring = connstr;
+
+ /* TODO: deparse and reconstruct the connection string properly. */
+ if (dbname)
+ {
+ PQExpBuffer connbuf = createPQExpBuffer();
+
+ printfPQExpBuffer(connbuf, "%s dbname=", connstr);
+ appendPQExpBufferConnstrValue(connbuf, dbname);
+ connstring = pg_strdup(connbuf->data);
+ destroyPQExpBuffer(connbuf);
+ }
+
+ conn = PQconnectdb(connstring);
+ if (PQstatus(conn) != CONNECTION_OK)
+ die(_("Connection to database failed: %s, connection string was: %s\n"), PQerrorMessage(conn), connstring);
+
+ return conn;
+}
void signal_handler(int sig)
{
int i;
int c;
PQExpBuffer recoveryconfcontents = createPQExpBuffer();
- char *remote_lsn;
- bool hot_standby;
+ RemoteInfo *remote_info;
+ NodeInfo node_info;
+ char restore_point_name[NAMEDATALEN];
+ char *remote_lsn;
+ bool stop = false;
+ int optindex;
char *local_connstr = NULL;
+ char *local_dbhost = NULL,
+ *local_dbport = NULL,
+ *local_dbuser = NULL;
char *remote_connstr = NULL;
- char *dbhost = NULL,
- *dbport = NULL,
- *dbuser = NULL;
+ char *remote_dbhost = NULL,
+ *remote_dbport = NULL,
+ *remote_dbuser = NULL;
+ char *postgresql_conf = NULL,
+ *pg_hba_conf = NULL,
+ *recovery_conf = NULL;
+
+ static struct option long_options[] = {
+ {"pgdata", required_argument, NULL, 'D'},
+ {"remote-dbname", required_argument, NULL, 'd'},
+ {"remote-host", required_argument, NULL, 'h'},
+ {"remote-port", required_argument, NULL, 'p'},
+ {"remote-user", required_argument, NULL, 'U'},
+ {"local-dbname", required_argument, NULL, 2},
+ {"local-host", required_argument, NULL, 3},
+ {"local-port", required_argument, NULL, 4},
+ {"local-user", required_argument, NULL, 5},
+ {"postgresql-conf", required_argument, NULL, 6},
+ {"hba-conf", required_argument, NULL, 7},
+ {"recovery-conf", required_argument, NULL, 8},
+ {"stop", no_argument, NULL, 's'},
+ {NULL, 0, NULL, 0}
+ };
argv0 = argv[0];
progname = get_progname(argv[0]);
}
/* Option parsing and validation */
- while ((c = getopt(argc, argv, "D:d:h:o:p:U:")) != -1)
+ while ((c = getopt_long(argc, argv, "D:d:h:p:sU:v", long_options, &optindex)) != -1)
{
switch (c)
{
case 'D':
data_dir = pg_strdup(optarg);
break;
- case 'o':
- config_options = pg_strdup(optarg);
- break;
case 'd':
remote_connstr = pg_strdup(optarg);
break;
case 'h':
- dbhost = pg_strdup(optarg);
+ remote_dbhost = pg_strdup(optarg);
break;
case 'p':
- dbport = pg_strdup(optarg);
+ remote_dbport = pg_strdup(optarg);
break;
case 'U':
- dbuser = pg_strdup(optarg);
+ remote_dbuser = pg_strdup(optarg);
+ break;
+ case 'v':
+ verbosity++;
+ break;
+ case 2:
+ local_connstr = pg_strdup(optarg);
+ break;
+ case 3:
+ local_dbhost = pg_strdup(optarg);
+ break;
+ case 4:
+ local_dbport = pg_strdup(optarg);
+ break;
+ case 5:
+ local_dbuser = pg_strdup(optarg);
+ break;
+ case 6:
+ {
+ postgresql_conf = pg_strdup(optarg);
+ if (postgresql_conf != NULL && !file_exists(postgresql_conf))
+ die(_("The specified postgresql.conf file does not exist."));
+ break;
+ }
+ case 7:
+ {
+ pg_hba_conf = pg_strdup(optarg);
+ if (pg_hba_conf != NULL && !file_exists(pg_hba_conf))
+ die(_("The specified pg_hba.conf file does not exist."));
+ break;
+ }
+ case 8:
+ {
+ recovery_conf = pg_strdup(optarg);
+ if (recovery_conf != NULL && !file_exists(recovery_conf))
+ die(_("The specified recovery.conf file does not exist."));
+ break;
+ }
+ case 's':
+ stop = true;
break;
default:
- fprintf(stderr, _("%s: unknown option\n"), progname);
+ fprintf(stderr, _("Unknown option\n"));
fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
exit(1);
}
if (data_dir == NULL)
{
- fprintf(stderr, _("%s: no data directory specified\n"), progname);
+ fprintf(stderr, _("No data directory specified\n"));
fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
exit(1);
}
- if (!is_pg_dir(data_dir))
- {
- die(_("%s: \"%s\" is not valid postgres data directory\n"), progname, data_dir);
- }
- snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", data_dir);
- print_msg(_("%s: starting...\n"), progname);
+ remote_connstr = get_connstr(remote_connstr, remote_dbhost, remote_dbport, remote_dbuser);
+ local_connstr = get_connstr(local_connstr, local_dbhost, local_dbport, local_dbuser);
+ if (!remote_connstr || !strlen(remote_connstr))
+ die(_("Remote connection must be specified.\n"));
+ if (!local_connstr || !strlen(local_connstr))
+ die(_("Local connection must be specified.\n"));
+
+ print_msg(VERBOSITY_NORMAL, _("%s: starting ...\n"), progname);
+
+ /*
+ * Generate new identifier for local node.
+ */
+ node_info.local_sysid = GenerateSystemIdentifier();
+ print_msg(VERBOSITY_VERBOSE,
+ _("Generated new local system identifier: "UINT64_FORMAT"\n"),
+ node_info.local_sysid);
+
+ /* Read the remote server identification. */
+ print_msg(VERBOSITY_NORMAL,
+ _("Getting remote server identification ...\n"));
+ remote_info = get_remote_info(remote_connstr);
+
+ /* If there are no BDR-enabled databases, just bail. */
+ if (remote_info->numdbs < 1)
+ die(_("Remote node does not have any BDR-enabled databases.\n"));
+
+ print_msg(VERBOSITY_NORMAL,
+ _("Detected %d BDR database(s) on remote server\n"),
+ remote_info->numdbs);
+
+ node_info.remote_sysid = remote_info->sysid;
+ node_info.remote_tlid = remote_info->tlid;
/*
- * Initialization
+ * Once the physical replication reaches the restore point, it will
+ * bump the timeline by one.
*/
- system_identifier = GenerateSystemIdentifier();
- print_msg(_("Assigning new system identifier: "UINT64_FORMAT"...\n"), system_identifier);
+ node_info.local_tlid = remote_info->tlid + 1;
- read_bdr_config();
+ print_msg(VERBOSITY_NORMAL,
+ _("Updating BDR configuration on the remote node:\n"));
- if (!remote_connstr && !dbhost && !dbport && !dbuser)
- remote_connstr = detect_remote_conninfo();
- else
- remote_connstr = get_conninfo(remote_connstr, dbhost, dbport, dbuser);
+ /* Initialize remote node. */
+ for (i = 0; i < remote_info->numdbs; i++)
+ {
+ char *dbname = remote_info->dbnames[i];
+ remote_conn = connectdb(remote_connstr, dbname);
- if (!remote_connstr || !strlen(remote_connstr))
- die(_("Could not detect remote connection\n"));
+ /*
+ * Make sure that we can use the remote node as the init node.
+ */
+ print_msg(VERBOSITY_NORMAL,
+ _(" %s: validating BDR configuration ...\n"), dbname);
+ validate_remote_node(remote_conn);
+
+ /*
+ * Create a replication slot for the new node on this remote database.
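+ *
+ * Creating the slot before the restore point is taken means the slot
+ * retains all changes committed on the remote after that point, so they
+ * can later be replayed logically.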
+ */
+ print_msg(VERBOSITY_NORMAL,
+ _(" %s: creating replication slot ...\n"), dbname);
+ initialize_replication_slot(remote_conn, &node_info, remote_info->dboids[i]);
- local_connstr = detect_local_conninfo();
- if (local_connstr == NULL)
- die(_("Failed to detect local connection info. Please specify replica_local_dsn in the postgresql.conf.\n"));
+ /*
+ * Create node entry for future local node.
+ */
+ print_msg(VERBOSITY_NORMAL,
+ _(" %s: creating node entry for local node ...\n"), dbname);
+ initialize_node_entry(remote_conn, &node_info, remote_info->dboids[i],
+ remote_connstr);
- /* Hot standby would start cluster in read only mode, we don't want that. */
- if (!parse_bool(get_postgres_guc_value("hot_standby", NULL), &hot_standby))
- die(_("Invalid boolean value for configuration parameter \"hot_standby\"\n"));
- if (hot_standby)
- die(_("Cluster cannot be configured with hot_standby = on when using bdr\n"));
+ /* Don't hold the connection open, since the next step might take a long time. */
+ PQfinish(remote_conn);
+ remote_conn = NULL;
+ }
- remove_unwanted_state();
+ /*
+ * Create a base backup, or use an existing one.
+ */
+ initialize_data_dir(data_dir, remote_connstr, postgresql_conf, pg_hba_conf);
+ snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", data_dir);
/*
- * Initialization done, create replication slots to init node
- * and restore point on remote side.
+ * Create restore point to which we will catchup via physical replication.
*/
- print_msg(_("Creating primary replication slots...\n"));
- initialize_replication_slots(true);
+ remote_conn = PQconnectdb(remote_connstr);
+ if (PQstatus(remote_conn) != CONNECTION_OK)
+ die(_("Connection to remote node failed: %s"), PQerrorMessage(remote_conn));
+
+ print_msg(VERBOSITY_NORMAL, _("Creating restore point on remote node ...\n"));
+
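+ /* The restore point is named after the newly generated system identifier. */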
+ snprintf(restore_point_name, NAMEDATALEN,
+ "bdr_"UINT64_FORMAT, node_info.local_sysid);
+ remote_lsn = create_restore_point(remote_conn, restore_point_name);
- print_msg(_("Creating restore point...\n"));
- snprintf(NameStr(restore_point_name), NAMEDATALEN,
- "bdr_"UINT64_FORMAT, system_identifier);
- remote_lsn = create_restore_point(remote_connstr);
+ PQfinish(remote_conn);
/*
* Get local db to consistent state (for lsn after slot creation).
*/
- print_msg(_("Bringing cluster to the restore point...\n"));
- appendPQExpBuffer(recoveryconfcontents, "standby_mode = 'on'\n");
- appendPQExpBuffer(recoveryconfcontents, "recovery_target_name = '%s'\n", NameStr(restore_point_name));
+ print_msg(VERBOSITY_NORMAL,
+ _("Bringing local node to the restore point ...\n"));
+ if (recovery_conf)
+ {
+ CopyConfFile(recovery_conf, "recovery.conf");
+ }
+ else
+ {
+ appendPQExpBuffer(recoveryconfcontents, "standby_mode = 'on'\n");
+ appendPQExpBuffer(recoveryconfcontents, "primary_conninfo = '%s'\n", remote_connstr);
+ }
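+ /* Physical replay stops once it reaches the named restore point (inclusive). */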
+ appendPQExpBuffer(recoveryconfcontents, "recovery_target_name = '%s'\n", restore_point_name);
appendPQExpBuffer(recoveryconfcontents, "recovery_target_inclusive = true\n");
- appendPQExpBuffer(recoveryconfcontents, "primary_conninfo = '%s'\n", remote_connstr);
WriteRecoveryConf(recoveryconfcontents);
- run_pg_ctl("start -w -l \"bdr_init_copy_postgres.log\"",
-#ifdef BUILDING_BDR
- "-c shared_preload_libraries=''"
-#else
- ""
-#endif
- );
- if (!wait_postmaster_connection())
- die(_("Could not connect to local node"));
+ /*
+ * Start the local node with BDR disabled, and wait until it starts accepting
+ * connections, which means it has caught up to the restore point.
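+ *
+ * BDR is left disabled for this start because the data directory still
+ * carries the origin node's system identifier; it only becomes a distinct
+ * node after set_sysid() below.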
+ */
+ run_pg_ctl("start -l \"bdr_init_copy_postgres.log\" -o \"-c shared_preload_libraries=''\"");
+ wait_postmaster_connection(local_connstr);
/*
- * Postgres should have reached restore point and is accepting connections,
- * create slots to other nodes and local replication identifiers.
+ * Clean any per-node data that were copied by pg_basebackup.
*/
- local_conn = PQconnectdb(local_connstr);
- if (PQstatus(local_conn) != CONNECTION_OK)
- die(_("Connection to database failed: %s"), PQerrorMessage(local_conn));
-
-#ifdef BUILDING_UDR
- print_msg(_("Ensuring bdr extension is installed...\n"));
- initialize_bdr(remote_conn);
- initialize_bdr(local_conn);
-#endif
+ for (i = 0; i < remote_info->numdbs; i++)
+ {
+ local_conn = connectdb(local_connstr, remote_info->dbnames[i]);
+
+ remove_unwanted_data(local_conn, remote_info->dbnames[i]);
+
+ PQfinish(local_conn);
+ local_conn = NULL;
+ }
- print_msg(_("Creating secondary replication slots...\n"));
- initialize_replication_slots(false);
- print_msg(_("Creating local replication identifier...\n"));
- initialize_replication_identifiers(remote_lsn);
+ /* Stop Postgres so we can reset system id and start it with BDR loaded. */
+ run_pg_ctl("stop");
+ wait_postmaster_shutdown();
- PQfinish(local_conn);
- local_conn = NULL;
+ /*
+ * Individualize the local node by changing the system identifier.
+ */
+ set_sysid(node_info.local_sysid);
/*
- * Make this node functional as individual bdr node and start it.
+ * Start the node again, now with BDR active, so that we can join it to
+ * the BDR cluster.
*/
- run_pg_ctl("stop", "");
- wait_postgres_shutdown();
+ print_msg(VERBOSITY_NORMAL,
+ _("Initializing BDR on the local node:\n"));
+
+ run_pg_ctl("start -l \"bdr_init_copy_postgres.log\"");
+ wait_postmaster_connection(local_connstr);
+
+ for (i = 0; i < remote_info->numdbs; i++)
+ {
+ char *dbname = remote_info->dbnames[i];
+
+ local_conn = connectdb(local_connstr, dbname);
+
+ /*
+ * Create the replication identifier, set to the position to which we have
+ * already caught up via physical replication.
+ */
+ print_msg(VERBOSITY_VERBOSE,
+ _(" %s: creating replication identifier ...\n"), dbname);
+ initialize_replication_identifier(local_conn, &node_info, remote_info->dboids[i], remote_lsn);
+
+ /*
+ * And finally add the node to the cluster.
+ */
+ print_msg(VERBOSITY_NORMAL,
+ _(" %s: adding the database to BDR cluster ...\n"), dbname);
+ bdr_node_start(local_conn, remote_connstr, local_connstr);
- set_sysid();
+ PQfinish(local_conn);
+ local_conn = NULL;
+ }
+
+ /* If the user does not want the node running at the end, stop it. */
+ if (stop)
+ {
+ print_msg(VERBOSITY_NORMAL, _("Stopping the local node ...\n"));
+ run_pg_ctl("stop");
+ wait_postmaster_shutdown();
+ }
- print_msg(_("Starting the cluster...\n"));
- run_pg_ctl("start -w", "-c bdr.init_from_basedump=true");
+ print_msg(VERBOSITY_NORMAL, _("All done\n"));
return 0;
}
static void
usage(void)
{
- printf(_("%s initializes bdr from PostgreSQL instance made using pg_basebackup.\n\n"), progname);
- printf(_("pg_basebackup -X stream must be used to populate the data directory before\n"));
- printf(_("running %s to initialize BDR on it.\n\n"), progname);
+ printf(_("%s initializes new BDR node from existing BDR instance.\n\n"), progname);
printf(_("Usage:\n"));
printf(_(" %s [OPTION]...\n"), progname);
printf(_("\nGeneral options:\n"));
- printf(_(" -D, --pgdata=DIRECTORY base backup directory\n"));
- printf(_(" -o configuration options passed to pg_ctl's -o\n"));
+ printf(_(" -D, --pgdata=DIRECTORY data directory to be used for new nodem,\n"));
+ printf(_(" can be either empty/non-existing directory,\n"));
+ printf(_(" or directory populated using pg_basebackup -X stream\n"));
+ printf(_(" command\n"));
+ printf(_(" -s, --stop stop the server once the initialization is done\n"));
+ printf(_(" --postgresql-conf path to the new postgresql.conf\n"));
+ printf(_(" --hba-conf path to the new pg_hba.conf\n"));
+ printf(_(" --recovery-conf path to the template recovery.conf\n"));
printf(_("\nConnection options:\n"));
- printf(_(" -d, --dbname=CONNSTR connection string\n"));
- printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
- printf(_(" -p, --port=PORT database server port number\n"));
- printf(_(" -U, --username=NAME connect as specified database user\n"));
+ printf(_(" -d, --remote-dbname=CONNSTR\n"));
+ printf(_(" connection string for remote node\n"));
+ printf(_(" -h, --remote-host=HOSTNAME\n"));
+ printf(_(" server host or socket directory for remote node\n"));
+ printf(_(" -p, --remote-port=PORT server port number for remote node\n"));
+ printf(_(" -U, --remote-user=NAME connect as specified database user to the remote node\n"));
+ printf(_(" --local-dbname=CONNSTR connection string for local node\n"));
+ printf(_(" --local-host=HOSTNAME server host or socket directory for local node\n"));
+ printf(_(" --local-port=PORT server port number for local node\n"));
+ printf(_(" --local-user=NAME connect as specified database user to the local node\n"));
}
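+
+/*
+ * Example invocation (illustrative only; paths, host names and ports are
+ * placeholders):
+ *
+ *   bdr_init_copy -D /path/to/new/datadir \
+ *       --remote-dbname "host=node1 dbname=postgres" \
+ *       --local-dbname "host=node2 dbname=postgres" \
+ *       --postgresql-conf /path/to/postgresql.conf
+ */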
/*
vfprintf(stderr, fmt, argptr);
va_end(argptr);
- PQfinish(local_conn);
- PQfinish(remote_conn);
+ if (local_conn)
+ PQfinish(local_conn);
+ if (remote_conn)
+ PQfinish(remote_conn);
if (get_pgpid())
- run_pg_ctl("stop -s", "");
+ run_pg_ctl("stop -s");
exit(1);
}
* Print message to stdout and flush
*/
static void
-print_msg(const char *fmt,...)
+print_msg(VerbosityLevelEnum level, const char *fmt,...)
{
- va_list argptr;
- va_start(argptr, fmt);
- vfprintf(stdout, fmt, argptr);
- va_end(argptr);
- fflush(stdout);
+ if (verbosity >= level)
+ {
+ va_list argptr;
+ va_start(argptr, fmt);
+ vfprintf(stdout, fmt, argptr);
+ va_end(argptr);
+ fflush(stdout);
+ }
}
* Start pg_ctl with given argument(s) - used to start/stop postgres
*/
static int
-run_pg_ctl(const char *arg, const char *opts)
+run_pg_ctl(const char *arg)
{
int ret;
PQExpBuffer cmd = createPQExpBuffer();
char *exec_path = find_other_exec_or_die(argv0, "pg_ctl", "pg_ctl (PostgreSQL) " PG_VERSION "\n");
- appendPQExpBuffer(cmd, "%s %s -D \"%s\" -o \"%s %s\"", exec_path, arg, data_dir,
- opts, config_options);
+ appendPQExpBuffer(cmd, "%s %s -D \"%s\" -s", exec_path, arg, data_dir);
+
+ /* Run pg_ctl in silent mode unless we run in debug mode. */
+ if (verbosity < VERBOSITY_DEBUG)
+ appendPQExpBuffer(cmd, " -s");
+ print_msg(VERBOSITY_DEBUG, _("Running pg_ctl: %s.\n"), cmd->data);
ret = system(cmd->data);
destroyPQExpBuffer(cmd);
/*
- * Ugly way to read postgresql.conf
+ * Run pg_basebackup to create a copy of the origin node.
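+ *
+ * The -X s (stream WAL) option is used so that the WAL needed to make the
+ * base backup consistent is included in the copy.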
*/
-static char *
-get_postgres_guc_value(char *guc, char *defval)
+static void
+run_basebackup(const char *remote_connstr, const char *data_dir)
{
- FILE *fp;
- int status;
+ int ret;
PQExpBuffer cmd = createPQExpBuffer();
- char *exec_path = find_other_exec_or_die(argv0, "postgres", PG_BACKEND_VERSIONSTR);
- PQExpBuffer retbuf = createPQExpBuffer();
- char buf[8192];
- char *ret;
-
- printfPQExpBuffer(cmd, "%s -D \"%s\" %s -C \"%s\" 2>\"%s\"",
- exec_path, data_dir, config_options, guc, DEVNULL);
+ char *exec_path = find_other_exec_or_die(argv0, "pg_basebackup", "pg_basebackup (PostgreSQL) " PG_VERSION "\n");
- fp = popen(cmd->data, "r");
- while (fgets(buf, sizeof(buf), fp) != NULL)
- appendPQExpBufferStr(retbuf, buf);
+ appendPQExpBuffer(cmd, "%s -D \"%s\" -d \"%s\" -X s -P", exec_path, data_dir, remote_connstr);
- status = pclose(fp);
- destroyPQExpBuffer(cmd);
+ /* Run pg_basebackup in verbose mode if we are running in verbose mode. */
+ if (verbosity >= VERBOSITY_VERBOSE)
+ appendPQExpBuffer(cmd, " -v");
- if (status != 0)
- {
- destroyPQExpBuffer(retbuf);
- return defval;
- }
+ print_msg(VERBOSITY_DEBUG, _("Running pg_basebackup: %s.\n"), cmd->data);
+ ret = system(cmd->data);
- ret = trimwhitespace(retbuf->data);
- destroyPQExpBuffer(retbuf);
+ destroyPQExpBuffer(cmd);
- return ret;
+ if (ret != 0)
+ die(_("pg_basebackup failed, cannot continue.\n"));
}
/*
* Set system identifier to system id we used for registering the slots.
*/
static int
-set_sysid(void)
+set_sysid(uint64 sysid)
{
int ret;
PQExpBuffer cmd = createPQExpBuffer();
char *exec_path = find_other_exec_or_die(argv0, "bdr_resetxlog", "bdr_resetxlog (PostgreSQL) " PG_VERSION "\n");
- appendPQExpBuffer(cmd, "%s \"-s "UINT64_FORMAT"\" \"%s\"", exec_path, system_identifier, data_dir);
+ appendPQExpBuffer(cmd, "%s \"-s "UINT64_FORMAT"\" \"%s\"", exec_path, sysid, data_dir);
+ print_msg(VERBOSITY_DEBUG, _("Running bdr_resetxlog: %s.\n"), cmd->data);
ret = system(cmd->data);
destroyPQExpBuffer(cmd);
return ret;
}
-
-/*
- * Read bdr configuration
- *
- * This is somewhat ugly version of bdr_create_con_gucs and parts of _PG_init
- */
-static void
-read_bdr_config(void)
-{
- char *connections;
- char *errormsg = NULL;
- int connection_config_idx;
- size_t connection_count = 0;
- char **connames;
- PQconninfoOption *options;
- PQconninfoOption *cur_option;
-
- connections = get_postgres_guc_value("bdr.connections", NULL);
- if (!connections)
- die(_("bdr.connections is empty\n"));
-
- connames = split_list_guc(connections, &connection_count);
- pg_free(connections);
-
- bdr_connection_config_count = connection_count;
- bdr_connection_configs = (BdrConnectionConfig**)
- pg_malloc0(bdr_connection_config_count * sizeof(BdrConnectionConfig*));
-
- for (connection_config_idx = 0; connection_config_idx < connection_count; connection_config_idx++)
- {
- char *name = (char *) connames[connection_config_idx];
- char *optname_dsn = pg_malloc(strlen(name) + 30);
- char *optname_local_dsn = pg_malloc(strlen(name) + 30);
- char *optname_replica = pg_malloc(strlen(name) + 30);
- char *optname_local_dbname = pg_malloc(strlen(name) + 30);
- BdrConnectionConfig *opts;
-
- sprintf(optname_dsn, "bdr.%s_dsn", name);
- sprintf(optname_local_dsn, "bdr.%s_replica_local_dsn", name);
- sprintf(optname_replica, "bdr.%s_init_replica", name);
- sprintf(optname_local_dbname, "bdr.%s_local_dbname", name);
-
- opts = pg_malloc0(sizeof(BdrConnectionConfig));
- opts->name = pg_strdup(name);
- opts->is_valid = false;
-
- bdr_connection_configs[connection_config_idx] = opts;
-
- opts->dsn = get_postgres_guc_value(optname_dsn, NULL);
- if (!opts->dsn)
- continue;
-
- opts->replica_local_dsn = get_postgres_guc_value(optname_local_dsn, NULL);
-
- if (!parse_bool(get_postgres_guc_value(optname_replica, "false"), &opts->init_replica))
- die(_("Invalid boolean value for configuration parameter \"%s\"\n"), optname_replica);
-
- opts->dbname = get_postgres_guc_value(optname_local_dbname, NULL);
-
- options = PQconninfoParse(opts->dsn, &errormsg);
- if (errormsg != NULL)
- {
- char *str = pg_strdup(errormsg);
-
- PQfreemem(errormsg);
- die(_("bdr %s: error in dsn: %s\n"), name, str);
- }
-
- if (opts->dbname == NULL)
- {
- cur_option = options;
- while (cur_option->keyword != NULL)
- {
- if (strcmp(cur_option->keyword, "dbname") == 0)
- {
- if (cur_option->val == NULL)
- die(_("bdr %s: no dbname set\n"), name);
-
- opts->dbname = pg_strdup(cur_option->val);
- }
- cur_option++;
- }
- }
-
-
- opts->is_valid = true;
-
- /* cleanup */
- PQconninfoFree(options);
- }
-}
-
-
-
/*
* Cleans up everything that was replicated via the base backup but is not wanted on the new node.
*/
static void
-remove_unwanted_state(void)
+remove_unwanted_files(void)
{
#ifdef BUILDING_BDR
DIR *lldir;
printfPQExpBuffer(llpath, "%s/%s", data_dir, LLOGCDIR);
+ print_msg(VERBOSITY_DEBUG, _("Removing data from \"%s\" directory.\n"),
+ llpath->data);
+
/*
* Remove stray logical replication checkpoints
*/
#endif
}
-
/*
- * Initialize replication slots
+ * Init the datadir
*
- * Get connection configs from bdr and use the info
- * to register replication slots for future use.
+ * This function either verifies that the provided datadir is a valid
+ * postgres data directory, or creates one using pg_basebackup.
+ *
+ * In either case, the new postgresql.conf and pg_hba.conf are copied into
+ * the datadir if they were provided.
*/
static void
-initialize_replication_slots(bool init_replica)
+initialize_data_dir(char *data_dir, char *connstr,
+ char *postgresql_conf, char *pg_hba_conf)
{
- int i;
-
- for (i = 0; i < bdr_connection_config_count; i++)
+ /* Run basebackup as needed. */
+ switch (pg_check_dir(data_dir))
{
- NameData slot_name;
- char remote_ident[256];
- RemoteInfo *ri;
- TimeLineID tlid;
- Oid dboid;
- char system_identifier_s[32];
- BdrConnectionConfig *cfg = bdr_connection_configs[i];
- PQExpBuffer conninfo = createPQExpBuffer();
-
- if (!cfg || !cfg->is_valid || cfg->init_replica != init_replica)
- continue;
-
- printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn);
- remote_conn = PQconnectdb(conninfo->data);
- destroyPQExpBuffer(conninfo);
-
- if (PQstatus(remote_conn) != CONNECTION_OK)
- {
- die(_("Could not connect to the remote server: %s\n"),
- PQerrorMessage(remote_conn));
- }
-
- ri = get_remote_info(remote_conn, cfg->dsn);
- dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname);
-
- /* XXX: this might break if timeline switch happens in meantime */
- tlid = cfg->init_replica ? ri->tlid + 1 : ri->tlid;
-
- snprintf(system_identifier_s, sizeof(system_identifier_s), UINT64_FORMAT, system_identifier);
- snprintf(NameStr(slot_name), NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
- ri->dboid, system_identifier_s, tlid,
- dboid, "");
- NameStr(slot_name)[NAMEDATALEN - 1] = '\0';
-
- create_replication_slot(remote_conn, &slot_name);
+ case 0: /* Does not exist */
+ case 1: /* Exists, empty */
+ {
+ if (connstr)
+ {
+ print_msg(VERBOSITY_NORMAL,
+ _("Creating base backup of the remote node...\n"));
+ run_basebackup(connstr, data_dir);
+ }
+ else
+ die(_("Directory \"%s\" does not exist.\n"),
+ data_dir);
+ break;
+ }
+ case 2:
+ case 3: /* Exists, not empty */
+ case 4:
+ {
+ if (!is_pg_dir(data_dir))
+ die(_("Directory \"%s\" exists but is not valid postgres data directory.\n"),
+ data_dir);
+ break;
+ }
+ case -1: /* Access problem */
+ die(_("Could not access directory \"%s\": %s.\n"),
+ data_dir, strerror(errno));
+ }
- PQfinish(remote_conn);
- remote_conn = NULL;
+ remove_unwanted_files();
- snprintf(remote_ident, sizeof(remote_ident),
- BDR_NODE_ID_FORMAT,
- ri->sysid, ri->tlid, ri->dboid, dboid,
- "");
- }
+ if (postgresql_conf)
+ CopyConfFile(postgresql_conf, "postgresql.conf");
+ if (pg_hba_conf)
+ CopyConfFile(pg_hba_conf, "pg_hba.conf");
}
/*
- * Get database Oid of the remotedb.
+ * Initialize a replication slot
*
- * Can't use the bdr_get_remote_dboid because it needs elog :(
+ * Create, on the remote database, the logical replication slot that the
+ * new local node will use to receive its changes.
*/
-static Oid
-get_remote_dboid(char *conninfo_db)
+static void
+initialize_replication_slot(PGconn *conn, NodeInfo *ni, Oid dboid)
{
- PGconn *dbConn;
+ char slotname[NAMEDATALEN];
+ char system_identifier_s[32];
+ PQExpBuffer query = createPQExpBuffer();
PGresult *res;
- char *remote_dboid;
- Oid remote_dboid_i;
-
- dbConn = PQconnectdb(conninfo_db);
- if (PQstatus(dbConn) != CONNECTION_OK)
- {
- die(_("Could not connect to the primary server: %s"), PQerrorMessage(dbConn));
- }
- res = PQexec(dbConn, "SELECT oid FROM pg_database WHERE datname = current_database()");
- if (PQresultStatus(res) != PGRES_TUPLES_OK)
- die(_("Could fetch database oid: %s"), PQerrorMessage(dbConn));
+ snprintf(system_identifier_s, sizeof(system_identifier_s), UINT64_FORMAT, ni->local_sysid);
+ snprintf(slotname, NAMEDATALEN, BDR_SLOT_NAME_FORMAT,
+ dboid, system_identifier_s, ni->local_tlid, dboid, "");
+ appendPQExpBuffer(query, "SELECT pg_create_logical_replication_slot(%s, '%s');",
+ PQescapeLiteral(conn, slotname, NAMEDATALEN), "bdr");
- if (PQntuples(res) != 1 || PQnfields(res) != 1)
- die(_("Could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"),
- PQntuples(res), PQnfields(res), 1, 1);
+ res = PQexec(conn, query->data);
- remote_dboid = PQgetvalue(res, 0, 0);
- if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
- die(_("could not parse remote database OID %s"), remote_dboid);
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ die(_("Could not create replication slot, status %s: %s\n"),
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
PQclear(res);
- PQfinish(dbConn);
-
- return remote_dboid_i;
+ destroyPQExpBuffer(query);
}
/*
* Read replication info about remote connection
*/
static RemoteInfo *
-get_remote_info(PGconn *conn, char* aux_connstr)
+get_remote_info(char* remote_connstr)
{
- RemoteInfo *ri = (RemoteInfo *)pg_malloc(sizeof(RemoteInfo));
+ RemoteInfo *ri = (RemoteInfo *)pg_malloc(sizeof(RemoteInfo));
char *remote_sysid;
char *remote_tlid;
- char *remote_dboid;
+ int i;
PGresult *res;
+ PQExpBuffer conninfo = createPQExpBuffer();
+
+ /*
+ * Fetch the system identification info (sysid, tlid) via replication
+ * connection - there is no way to get this info via SQL.
+ */
+ printfPQExpBuffer(conninfo, "%s replication=database", remote_connstr);
+ remote_conn = PQconnectdb(conninfo->data);
+ destroyPQExpBuffer(conninfo);
+
+ if (PQstatus(remote_conn) != CONNECTION_OK)
+ {
+ die(_("Could not connect to the remote server: %s\n"),
+ PQerrorMessage(remote_conn));
+ }
- res = PQexec(conn, "IDENTIFY_SYSTEM");
+ res = PQexec(remote_conn, "IDENTIFY_SYSTEM");
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
PQclear(res);
die(_("Could not send replication command \"%s\": %s\n"),
- "IDENTIFY_SYSTEM", PQerrorMessage(conn));
+ "IDENTIFY_SYSTEM", PQerrorMessage(remote_conn));
}
if (PQntuples(res) != 1 || PQnfields(res) < 4 || PQnfields(res) > 5)
remote_sysid = PQgetvalue(res, 0, 0);
remote_tlid = PQgetvalue(res, 0, 1);
- if (PQnfields(res) == 5)
- {
- remote_dboid = PQgetvalue(res, 0, 4);
- if (sscanf(remote_dboid, "%u", &ri->dboid) != 1)
- die(_("could not parse remote database OID %s"), remote_dboid);
- }
- else
- {
- ri->dboid = get_remote_dboid(aux_connstr);
- }
-
#ifdef HAVE_STRTOULL
ri->sysid = strtoull(remote_sysid, NULL, 10);
#else
die(_("Could not parse remote tlid %s\n"), remote_tlid);
PQclear(res);
+ PQfinish(remote_conn);
+ remote_conn = NULL;
- return ri;
-}
+ /*
+ * Fetch the list of BDR-enabled databases via a standard SQL connection.
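+ * BDR marks each enabled database with a security label (provider 'bdr')
+ * on the database object, so pg_shseclabel gives us the list.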
+ */
+ remote_conn = PQconnectdb(remote_connstr);
+ if (PQstatus(remote_conn) != CONNECTION_OK)
+ {
+ die(_("Could not connect to the remote server: %s"), PQerrorMessage(remote_conn));
+ }
-/*
- * Get dboid based on dbname
- */
-static Oid
-get_dboid_from_dbname(PGconn *conn, const char* dbname)
-{
- char *dboid_str;
- Oid dboid;
- PQExpBuffer query = createPQExpBuffer();
- PGresult *res;
+ res = PQexec(remote_conn, "SELECT d.oid, d.datname "
+ "FROM pg_catalog.pg_database d, pg_catalog.pg_shseclabel l "
+ "WHERE l.provider = 'bdr' "
+ " AND l.classoid = 'pg_database'::regclass "
+ " AND d.oid = l.objoid;");
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ die(_("Could fetch remote database list: %s"), PQerrorMessage(remote_conn));
- appendPQExpBuffer(query, "SELECT oid FROM pg_catalog.pg_database WHERE datname = '%s'",
- dbname);
+ ri->numdbs = PQntuples(res);
+ ri->dboids = (Oid *) pg_malloc(ri->numdbs * sizeof(Oid));
+ ri->dbnames = (char **) pg_malloc(ri->numdbs * sizeof(char *));
- res = PQexec(conn, query->data);
- if (PQresultStatus(res) != PGRES_TUPLES_OK || PQntuples(res) != 1)
+ for (i = 0; i < ri->numdbs; i++)
{
- PQclear(res);
- die(_("Could not get database id for \"%s\": %s\n"),
- dbname, PQerrorMessage(conn));
- }
+ char *remote_dboid = PQgetvalue(res, i, 0);
+ char *remote_dbname = PQgetvalue(res, i, 1);
+ Oid remote_dboid_i;
- dboid_str = PQgetvalue(res, 0, 0);
- if (sscanf(dboid_str, "%u", &dboid) != 1)
- die(_("Could not parse database OID %s\n"), dboid_str);
+ if (sscanf(remote_dboid, "%u", &remote_dboid_i) != 1)
+ die(_("Could not parse database OID %s"), remote_dboid);
+
+ ri->dboids[i] = remote_dboid_i;
+ ri->dbnames[i] = pstrdup(remote_dbname);
+ }
PQclear(res);
- destroyPQExpBuffer(query);
- return dboid;
+ PQfinish(remote_conn);
+ remote_conn = NULL;
+
+ return ri;
}
+
/*
- * Create replication slot
+ * Check if extension exists.
*/
-static void
-create_replication_slot(PGconn *conn, Name slot_name)
+static bool
+extension_exists(PGconn *conn, const char *extname)
{
- PQExpBuffer query = createPQExpBuffer();
- PGresult *res;
-
- appendPQExpBuffer(query, "CREATE_REPLICATION_SLOT \"%s\" LOGICAL %s",
- NameStr(*slot_name), "bdr");
+ PQExpBuffer query = createPQExpBuffer();
+ PGresult *res;
+ bool ret;
+ printfPQExpBuffer(query, "SELECT 1 FROM pg_catalog.pg_extension WHERE extname = %s;",
+ PQescapeLiteral(conn, extname, strlen(extname)));
res = PQexec(conn, query->data);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- die(_("Could not send replication command \"%s\": status %s: %s\n"),
- query->data,
- PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ PQclear(res);
+ die(_("Could not read extension info: %s\n"), PQerrorMessage(conn));
}
+ ret = PQntuples(res) == 1;
+
PQclear(res);
destroyPQExpBuffer(query);
+
+ return ret;
}
-#ifdef BUILDING_UDR
+/*
+ * Create extension.
+ */
static void
-install_extension_if_not_exists(PGconn *conn, const char *extname)
+install_extension(PGconn *conn, const char *extname)
{
PQExpBuffer query = createPQExpBuffer();
PGresult *res;
- printfPQExpBuffer(query, "SELECT 1 FROM pg_catalog.pg_extension WHERE extname = %s;",
- PQescapeLiteral(conn, extname, strlen(extname)));
+ printfPQExpBuffer(query, "CREATE EXTENSION %s;",
+ PQescapeIdentifier(conn, extname, strlen(extname)));
res = PQexec(conn, query->data);
- if (PQresultStatus(res) != PGRES_TUPLES_OK)
- {
- PQclear(res);
- die(_("Could not read extension info: %s\n"), PQerrorMessage(conn));
- }
-
- if (PQntuples(res) != 1)
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
PQclear(res);
-
- printfPQExpBuffer(query, "CREATE EXTENSION %s;",
- PQescapeIdentifier(conn, extname, strlen(extname)));
- res = PQexec(conn, query->data);
-
- if (PQresultStatus(res) != PGRES_COMMAND_OK)
- {
- PQclear(res);
- die(_("Could not install %s extension: %s\n"), extname, PQerrorMessage(conn));
- }
+ die(_("Could not install %s extension: %s\n"), extname, PQerrorMessage(conn));
}
PQclear(res);
}
/*
- * Initialize bdr extension (if not already initialized).
- *
- * Should have similar logic as bdr_maintain_schema in bdr.c.
+ * Validate that the BDR extension is installed on the remote node
+ * and that at least one BDR node entry is present.
*/
static void
-initialize_bdr(PGconn *conn)
+validate_remote_node(PGconn *conn)
{
- install_extension_if_not_exists(conn, "btree_gist");
- install_extension_if_not_exists(conn,"bdr");
-}
-#endif
+ PGresult *res;
+ PQExpBuffer query = createPQExpBuffer();
-/*
- * Initialize new remote identifiers to specific position.
- */
-static void
-initialize_replication_identifiers(char *remote_lsn)
-{
- int i;
- PGresult *res;
+ if (!extension_exists(conn, "bdr"))
+ die(_("The BDR extension must be installed on remote node.\n"));
- /* Remove replication identifiers */
- res = PQexec(local_conn, "SELECT "RIINTERFACE_PREFIX"replication_identifier_drop(riname) FROM "RIINTERFACE_PREFIX"replication_identifier;");
+#ifdef BUILDING_BDR
+ res = PQexec(conn, "SELECT 1 FROM bdr.bdr_nodes;");
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
PQclear(res);
- die(_("Could not remove replication identifier: %s\n"), PQerrorMessage(local_conn));
+ die(_("Could fetch BDR info: %s\n"), PQerrorMessage(conn));
}
- /* Initialize new replication identifiers */
- for (i = 0; i < bdr_connection_config_count; i++)
- {
- char remote_ident[256];
- Oid dboid;
- RemoteInfo *ri;
- BdrConnectionConfig *cfg = bdr_connection_configs[i];
- PQExpBuffer conninfo = createPQExpBuffer();
+ if (PQntuples(res) < 1)
+ die(_("The remote node is not configured as a BDR node.\n"));
- if (!cfg || !cfg->is_valid)
- continue;
+ PQclear(res);
+#endif
- printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn);
- remote_conn = PQconnectdb(conninfo->data);
- destroyPQExpBuffer(conninfo);
+ destroyPQExpBuffer(query);
+}
- if (PQstatus(remote_conn) != CONNECTION_OK)
- {
- die(_("Could not connect to the remote server: %s\n"),
- PQerrorMessage(remote_conn));
- }
- ri = get_remote_info(remote_conn, cfg->dsn);
- dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname);
+/*
+ * Insert a node entry for the local node into the remote's bdr.bdr_nodes.
+ */
+static void
+initialize_node_entry(PGconn *conn, NodeInfo *ni, Oid dboid,
+ char *remote_connstr)
+{
+ PQExpBuffer query = createPQExpBuffer();
+ PGresult *res;
- PQfinish(remote_conn);
- remote_conn = NULL;
+ printfPQExpBuffer(query, "INSERT INTO bdr.bdr_nodes"
+ " (node_status, node_sysid, node_timeline,"
+ " node_dboid, node_init_from_dsn)"
+ " VALUES ('c', '"UINT64_FORMAT"', %u, %u, %s);",
+ ni->local_sysid, ni->local_tlid, dboid,
+ PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)));
+ res = PQexec(conn, query->data);
+
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ {
+ PQclear(res);
+ die(_("Failed to insert row into bdr.bdr_nodes: %s\n"), PQerrorMessage(conn));
+ }
+
+ PQclear(res);
+ destroyPQExpBuffer(query);
+}
+
+/*
+ * Clean out data that was copied from the remote node but is not wanted
+ * here (currently shared security labels and replication identifiers).
+ */
+static void
+remove_unwanted_data(PGconn *conn, char *dbname)
+{
+ PGresult *res;
- snprintf(remote_ident, sizeof(remote_ident),
- BDR_NODE_ID_FORMAT,
- ri->sysid, ri->tlid, ri->dboid, dboid,
- "");
+ /* Remove any BDR security labels. */
+ res = PQexec(conn, "DELETE FROM pg_catalog.pg_shseclabel WHERE provider = 'bdr';");
- create_replication_identifier(local_conn, remote_ident,
- cfg->init_replica ? remote_lsn : NULL);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ {
+ PQclear(res);
+ die(_("Could not update security label: %s\n"), PQerrorMessage(conn));
+ }
+
+ /* Remove replication identifiers. */
+ res = PQexec(conn, "SELECT "RIINTERFACE_PREFIX"replication_identifier_drop(riname) FROM "RIINTERFACE_PREFIX"replication_identifier;");
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ PQclear(res);
+ die(_("Could not remove existing replication identifiers: %s\n"), PQerrorMessage(conn));
}
+ PQclear(res);
}
/*
- * Create local replication identifier
+ * Initialize a new replication identifier for the remote node at a specific position.
*/
static void
-create_replication_identifier(PGconn *conn, const char *remote_ident, char *remote_lsn)
+initialize_replication_identifier(PGconn *conn, NodeInfo *ni, Oid dboid, char *remote_lsn)
{
- PQExpBuffer query = createPQExpBuffer();
PGresult *res;
+ char remote_ident[256];
+ PQExpBuffer query = createPQExpBuffer();
+
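+ /* The identifier names the remote node: its sysid, timeline and database oid. */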
+ snprintf(remote_ident, sizeof(remote_ident), BDR_NODE_ID_FORMAT,
+ ni->remote_sysid, ni->remote_tlid, dboid, dboid, "");
printfPQExpBuffer(query, "SELECT "RIINTERFACE_PREFIX"replication_identifier_create('%s')",
remote_ident);
* state through physical replay.
*/
static char *
-create_restore_point(char *remote_connstr)
+create_restore_point(PGconn *conn, char *restore_point_name)
{
PQExpBuffer query = createPQExpBuffer();
PGresult *res;
char *remote_lsn = NULL;
- remote_conn = PQconnectdb(remote_connstr);
- if (PQstatus(remote_conn) != CONNECTION_OK)
- {
- die(_("Could not connect to the remote server: %s\n"),
- PQerrorMessage(remote_conn));
- }
-
- printfPQExpBuffer(query, "SELECT pg_create_restore_point('%s')", NameStr(restore_point_name));
- res = PQexec(remote_conn, query->data);
+ printfPQExpBuffer(query, "SELECT pg_create_restore_point('%s')", restore_point_name);
+ res = PQexec(conn, query->data);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- die(_("Could not create restore point \"%s\": status %s: %s\n"),
- query->data,
+ die(_("Could not create restore point, status %s: %s\n"),
PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
}
remote_lsn = pstrdup(PQgetvalue(res, 0, 0));
PQclear(res);
- PQfinish(remote_conn);
- remote_conn = NULL;
destroyPQExpBuffer(query);
return remote_lsn;
}
-static char *
-detect_local_conninfo(void)
-{
- int i;
- for (i = 0; i < bdr_connection_config_count; i++)
- {
- BdrConnectionConfig *cfg = bdr_connection_configs[i];
-
- if (!cfg || !cfg->is_valid || !cfg->init_replica ||
- !cfg->replica_local_dsn)
- continue;
-
- return pg_strdup(cfg->replica_local_dsn);
- }
-
- return NULL;
-}
-
-static char *
-detect_remote_conninfo(void)
+static void
+bdr_node_start(PGconn *conn, char *remote_connstr, char *local_connstr)
{
- int i;
+ PQExpBuffer query = createPQExpBuffer();
+ PGresult *res;
- for (i = 0; i < bdr_connection_config_count; i++)
- {
- BdrConnectionConfig *cfg = bdr_connection_configs[i];
+ /* Install required extensions if needed. */
+ if (!extension_exists(conn, "btree_gist"))
+ install_extension(conn, "btree_gist");
+ if (!extension_exists(conn, "bdr"))
+ install_extension(conn, "bdr");
- if (!cfg || !cfg->is_valid || !cfg->init_replica)
- continue;
+ /* Add the node to the cluster. */
+#ifdef BUILDING_BDR
+ printfPQExpBuffer(query, "SELECT bdr.bdr_group_join(%s, %s);",
+ PQescapeLiteral(conn, local_connstr, strlen(local_connstr)),
+ PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)));
+#else
+ printfPQExpBuffer(query, "SELECT bdr.bdr_subscribe(%s, %s);",
+ PQescapeLiteral(conn, remote_connstr, strlen(remote_connstr)),
+ PQescapeLiteral(conn, local_connstr, strlen(local_connstr)));
+#endif
- return pg_strdup(cfg->dsn);
+ res = PQexec(conn, query->data);
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ die(_("Could not add local node to cluster, status %s: %s\n"),
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
}
- return NULL;
+ PQclear(res);
+ destroyPQExpBuffer(query);
}
+/*
+ * Build a connection string from individual parameters.
+ *
+ * This function also handles the case where a full connection string was
+ * specified instead of a dbname.
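+ *
+ * For example (illustrative; exact quoting may differ),
+ * get_connstr("mydb", "node1", "5432", NULL) yields a string along the
+ * lines of  dbname=mydb host=node1 port=5432 .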
+ */
char *
-get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser)
+get_connstr(char *dbname, char *dbhost, char *dbport, char *dbuser)
{
char *ret;
int argcount = 4; /* dbname, host, user, port */
{
keywords = pg_malloc0((argcount + 1) * sizeof(*keywords));
values = pg_malloc0((argcount + 1) * sizeof(*values));
-
- keywords[i] = "dbname";
- values[i] = dbname == NULL ? "postgres" : dbname;
- i++;
}
if (dbhost)
i++;
}
- ret = PQconninfoParams_to_conninfo(keywords, values);
+ ret = PQconninfoParamsToConnstr(keywords, values);
/* Connection ok! */
pg_free(values);
fclose(cf);
}
+/*
+ * Copy a configuration file into the data directory.
+ */
+static void
+CopyConfFile(char *fromfile, char *tofile)
+{
+ char filename[MAXPGPATH];
+
+ sprintf(filename, "%s/%s", data_dir, tofile);
+
+ print_msg(VERBOSITY_DEBUG, _("Copying \"%s\" to \"%s\".\n"),
+ fromfile, filename);
+ copy_file(fromfile, filename);
+}
+
+
/*
* Convert PQconninfoOption array into conninfo string
*/
static char *
-PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values)
+PQconninfoParamsToConnstr(const char *const * keywords, const char *const * values)
{
PQExpBuffer retbuf = createPQExpBuffer();
char *ret;
int i = 0;
- while (keywords[i])
+ for (i = 0; keywords[i] != NULL; i++)
{
- char *tmpval = escapeConninfoValue(values[i]);
- appendPQExpBuffer(retbuf, "%s = '%s' ", keywords[i], tmpval);
- pg_free(tmpval);
- i++;
+ if (i > 0)
+ appendPQExpBufferChar(retbuf, ' ');
+ appendPQExpBuffer(retbuf, "%s=", keywords[i]);
+ appendPQExpBufferConnstrValue(retbuf, values[i]);
}
ret = pg_strdup(retbuf->data);
/*
* Escape connection info value
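* For example, the value  it's  is emitted as  'it\'s' , while plain
* alphanumeric values are appended unquoted.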
*/
-static char *
-escapeConninfoValue(const char *val)
+static void
+appendPQExpBufferConnstrValue(PQExpBuffer buf, const char *str)
{
- int i, j;
- char *ret = pg_malloc(strlen(val) * 2 + 1);
+ const char *s;
+ bool needquotes;
- j = 0;
- for (i = 0; i < strlen(val); i++)
+ /*
+ * If the string consists entirely of plain ASCII letters, digits, '_' and
+ * '.', there is no need to quote it. This is quite conservative, but better
+ * safe than sorry.
+ */
+ needquotes = false;
+ for (s = str; *s; s++)
{
- switch (val[i])
+ if (!((*s >= 'a' && *s <= 'z') || (*s >= 'A' && *s <= 'Z') ||
+ (*s >= '0' && *s <= '9') || *s == '_' || *s == '.'))
{
- case '\\':
- case '\'':
- ret[j++] = '\\';
- default:
- break;
+ needquotes = true;
+ break;
}
-
- ret[j++] = val[i];
}
- ret[j] = '\0';
-
- return ret;
-}
-
-
-/*
- * Taken from adt/bool.c
- *
- * Try to interpret value as boolean value. Valid values are: true,
- * false, yes, no, on, off, 1, 0; as well as unique prefixes thereof.
- * If the string parses okay, return true, else false.
- * If okay and result is not NULL, return the value in *result.
- */
-static bool
-parse_bool(const char *value, bool *result)
-{
- return parse_bool_with_len(value, strlen(value), result);
-}
-
-static bool
-parse_bool_with_len(const char *value, size_t len, bool *result)
-{
- switch (*value)
+ if (needquotes)
{
- case 't':
- case 'T':
- if (pg_strncasecmp(value, "true", len) == 0)
- {
- if (result)
- *result = true;
- return true;
- }
- break;
- case 'f':
- case 'F':
- if (pg_strncasecmp(value, "false", len) == 0)
- {
- if (result)
- *result = false;
- return true;
- }
- break;
- case 'y':
- case 'Y':
- if (pg_strncasecmp(value, "yes", len) == 0)
- {
- if (result)
- *result = true;
- return true;
- }
- break;
- case 'n':
- case 'N':
- if (pg_strncasecmp(value, "no", len) == 0)
- {
- if (result)
- *result = false;
- return true;
- }
- break;
- case 'o':
- case 'O':
- /* 'o' is not unique enough */
- if (pg_strncasecmp(value, "on", (len > 2 ? len : 2)) == 0)
- {
- if (result)
- *result = true;
- return true;
- }
- else if (pg_strncasecmp(value, "off", (len > 2 ? len : 2)) == 0)
- {
- if (result)
- *result = false;
- return true;
- }
- break;
- case '1':
- if (len == 1)
- {
- if (result)
- *result = true;
- return true;
- }
- break;
- case '0':
- if (len == 1)
- {
- if (result)
- *result = false;
- return true;
- }
- break;
- default:
- break;
- }
-
- if (result)
- *result = false; /* suppress compiler warning */
- return false;
-}
-
-/*
- * Remove leading and trailing whitespace from the string,
- * does not change input
- */
-static char *
-trimwhitespace(const char *str)
-{
- const char *end;
- char *res;
- size_t len;
-
- while(isspace(*str))
- str++;
-
- if(*str == 0)
- return NULL;
-
- end = str + strlen(str) - 1;
- while(end > str && isspace(*end))
- end--;
-
- len = end-str;
- if (!len)
- return NULL;
-
- len++;
- res = pg_malloc(len+1);
- memcpy(res, str, len);
- res[len] = '\0';
-
- return res;
-}
-
-/*
- * Split guc list paramenter into array
- * Note that this is not 100% compatible with that is in core
- * but seems good enough for our purposes
- */
-static char **
-split_list_guc(char *str, size_t *count)
-{
- char **ret = NULL;
- char *t = strtok (str, ",");
- size_t i = 0;
-
- while (t) {
- ret = realloc(ret, sizeof(char*)* ++i);
-
- if (ret == NULL)
- die(_("Out of memory\n"));
-
- t = trimwhitespace(t);
- if (!t)
- die(_("Bad input for list: %s\n"), str);
-
- ret[i-1] = t;
+ appendPQExpBufferChar(buf, '\'');
+ while (*str)
+ {
+ /* ' and \ must be escaped to \' and \\ */
+ if (*str == '\'' || *str == '\\')
+ appendPQExpBufferChar(buf, '\\');
- t = strtok(NULL, ",");
+ appendPQExpBufferChar(buf, *str);
+ str++;
+ }
+ appendPQExpBufferChar(buf, '\'');
}
-
- *count = i;
- return ret;
+ else
+ appendPQExpBufferStr(buf, str);
}
/*
* Find the pgport and try a connection
- *
- * Based on pg_ctl.c:test_postmaster_connection
*/
-static bool
-wait_postmaster_connection(void)
+static void
+wait_postmaster_connection(const char *connstr)
{
PGPing res;
- long pm_pid = 0;
- char connstr[MAXPGPATH * 2 + 256];
+ long pmpid = 0;
- connstr[0] = '\0';
+ print_msg(VERBOSITY_VERBOSE, "Waiting for PostgreSQL to accept connections ...");
+ /* First wait for Postmaster to come up. */
for (;;)
{
- /* Do we need a connection string? */
- if (connstr[0] == '\0')
- {
- /*----------
- * The number of lines in postmaster.pid tells us several things:
- *
- * # of lines
- * 0 lock file created but status not written
- * 2 pre-9.1 server, shared memory not created
- * 3 pre-9.1 server, shared memory created
- * 5 9.1+ server, ports not opened
- * 6 9.1+ server, shared memory not created
- * 7 9.1+ server, shared memory created
- *
- * If we see less than 6 lines in postmaster.pid, just keep
- * waiting.
- *----------
- */
- char **optlines;
-
- /* Try to read the postmaster.pid file */
- if ((optlines = readfile(pid_file)) != NULL &&
- optlines[0] != NULL &&
- optlines[1] != NULL &&
- optlines[2] != NULL &&
- optlines[3] != NULL &&
- optlines[4] != NULL &&
- optlines[5] != NULL)
- {
- /* File is complete enough for us, parse it */
- long pmpid;
- time_t pmstart;
-
- /*
- * Make sanity checks. If it's for a standalone backend
- * (negative PID), or the recorded start time is before
- * pg_ctl started, then either we are looking at the wrong
- * data directory, or this is a pre-existing pidfile that
- * hasn't (yet?) been overwritten by our child postmaster.
- * Allow 2 seconds slop for possible cross-process clock
- * skew.
- */
- pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
- pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
- if (pmpid > 0 || pmstart > start_time - 3)
- {
- /*
- * OK, seems to be a valid pidfile from our child.
- */
- int portnum;
- char *sockdir;
- char *hostaddr;
- char host_str[MAXPGPATH];
-
- pm_pid = pmpid;
-
- /*
- * Extract port number and host string to use. Prefer
- * using Unix socket if available.
- */
- portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]);
- sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1];
- hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1];
-
- /*
- * While unix_socket_directories can accept relative
- * directories, libpq's host parameter must have a
- * leading slash to indicate a socket directory. So,
- * ignore sockdir if it's relative, and try to use TCP
- * instead.
- */
- if (sockdir[0] == '/')
- strlcpy(host_str, sockdir, sizeof(host_str));
- else
- strlcpy(host_str, hostaddr, sizeof(host_str));
-
- /* remove trailing newline */
- if (strchr(host_str, '\n') != NULL)
- *strchr(host_str, '\n') = '\0';
-
- /* Fail if couldn't get either sockdir or host addr */
- if (host_str[0] == '\0')
- {
- fprintf(stderr, _("Relative socket directory is not supported\n"));
- return false;
- }
-
- /* If postmaster is listening on "*", use localhost */
- if (strcmp(host_str, "*") == 0)
- strcpy(host_str, "localhost");
-
- /*
- * We need to set connect_timeout otherwise on Windows
- * the Service Control Manager (SCM) will probably
- * timeout first.
- */
- snprintf(connstr, sizeof(connstr),
- "dbname=postgres port=%d host='%s' connect_timeout=5",
- portnum, host_str);
- }
- }
+ if ((pmpid = get_pgpid()) != 0 &&
+ postmaster_is_alive((pid_t) pmpid))
+ break;
- /*
- * Free the results of readfile.
- *
- * This is safe to call even if optlines is NULL.
- */
- free_readfile(optlines);
- }
+ pg_usleep(1000000); /* 1 sec */
+ print_msg(VERBOSITY_VERBOSE, ".");
+ }
- /* If we have a connection string, ping the server */
- if (connstr[0] != '\0')
- {
- res = PQping(connstr);
- if (res == PQPING_OK)
- {
- break;
- }
- else if (res == PQPING_NO_ATTEMPT)
- return false;
- }
+ /* Now wait for Postmaster to either accept connections or die. */
+ for (;;)
+ {
+ res = PQping(connstr);
+ if (res == PQPING_OK)
+ break;
+ else if (res == PQPING_NO_ATTEMPT)
+ break;
/*
- * If we've been able to identify the child postmaster's PID, check
- * the process is still alive. This covers cases where the postmaster
- * successfully created the pidfile but then crashed without removing
- * it.
+ * Check if the process is still alive. This covers cases where the
+ * postmaster successfully created the pidfile but then crashed without
+ * removing it.
*/
- if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
- return false;
+ if (!postmaster_is_alive((pid_t) pmpid))
+ break;
- /* No response, or startup still in process; wait */
+ /* No response; wait */
pg_usleep(1000000); /* 1 sec */
- print_msg(".");
+ print_msg(VERBOSITY_VERBOSE, ".");
}
- return true;
+ print_msg(VERBOSITY_VERBOSE, "\n");
}
/*
* Wait for postmaster to die
*/
static void
-wait_postgres_shutdown(void)
+wait_postmaster_shutdown(void)
{
long pid;
+ print_msg(VERBOSITY_VERBOSE, "Waiting for PostgreSQL to shutdown ...");
+
for (;;)
{
if ((pid = get_pgpid()) != 0)
{
pg_usleep(1000000); /* 1 sec */
- print_msg(".");
+ print_msg(VERBOSITY_NORMAL, ".");
}
else
break;
}
+
+ print_msg(VERBOSITY_VERBOSE, "\n");
+}
+
+static bool
+file_exists(const char *path)
+{
+ struct stat statbuf;
+
+ if (stat(path, &statbuf) != 0)
+ return false;
+
+ return true;
}
static bool
-is_pg_dir(char *path)
+is_pg_dir(const char *path)
{
struct stat statbuf;
char version_file[MAXPGPATH];
return true;
}
+/*
+ * copy one file
+ */
+static void
+copy_file(char *fromfile, char *tofile)
+{
+ char *buffer;
+ int srcfd;
+ int dstfd;
+ int nbytes;
+ off_t offset;
+
+#define COPY_BUF_SIZE (8 * BLCKSZ)
+
+ buffer = pg_malloc(COPY_BUF_SIZE);
+
+ /*
+ * Open the files
+ */
+ srcfd = open(fromfile, O_RDONLY | PG_BINARY, 0);
+ if (srcfd < 0)
+ die(_("could not open file \"%s\""), fromfile);
+
+ dstfd = open(tofile, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (dstfd < 0)
+ die(_("could not create file \"%s\""), tofile);
+
+ /*
+ * Do the data copying.
+ */
+ for (offset = 0;; offset += nbytes)
+ {
+ nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+ if (nbytes < 0)
+ die(_("could not read file \"%s\""), fromfile);
+ if (nbytes == 0)
+ break;
+ errno = 0;
+ if ((int) write(dstfd, buffer, nbytes) != nbytes)
+ {
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ die(_("could not write to file \"%s\""), tofile);
+ }
+ }
+
+ if (close(dstfd))
+ die(_("could not close file \"%s\""), tofile);
+
+ /* we don't care about errors here */
+ close(srcfd);
+
+ free(buffer);
+}
+
/*
* Utility functions taken from pg_ctl
*/
fclose(pidf);
return pid;
}
-
-/*
- * get the lines from a text file - return NULL if file can't be opened
- */
-static char **
-readfile(const char *path)
-{
- int fd;
- int nlines;
- char **result;
- char *buffer;
- char *linebegin;
- int i;
- int n;
- int len;
- struct stat statbuf;
-
- /*
- * Slurp the file into memory.
- *
- * The file can change concurrently, so we read the whole file into memory
- * with a single read() call. That's not guaranteed to get an atomic
- * snapshot, but in practice, for a small file, it's close enough for the
- * current use.
- */
- fd = open(path, O_RDONLY | PG_BINARY, 0);
- if (fd < 0)
- return NULL;
- if (fstat(fd, &statbuf) < 0)
- {
- close(fd);
- return NULL;
- }
- if (statbuf.st_size == 0)
- {
- /* empty file */
- close(fd);
- result = (char **) pg_malloc(sizeof(char *));
- *result = NULL;
- return result;
- }
- buffer = pg_malloc(statbuf.st_size + 1);
-
- len = read(fd, buffer, statbuf.st_size + 1);
- close(fd);
- if (len != statbuf.st_size)
- {
- /* oops, the file size changed between fstat and read */
- free(buffer);
- return NULL;
- }
-
- /*
- * Count newlines. We expect there to be a newline after each full line,
- * including one at the end of file. If there isn't a newline at the end,
- * any characters after the last newline will be ignored.
- */
- nlines = 0;
- for (i = 0; i < len; i++)
- {
- if (buffer[i] == '\n')
- nlines++;
- }
-
- /* set up the result buffer */
- result = (char **) pg_malloc((nlines + 1) * sizeof(char *));
-
- /* now split the buffer into lines */
- linebegin = buffer;
- n = 0;
- for (i = 0; i < len; i++)
- {
- if (buffer[i] == '\n')
- {
- int slen = &buffer[i] - linebegin + 1;
- char *linebuf = pg_malloc(slen + 1);
-
- memcpy(linebuf, linebegin, slen);
- linebuf[slen] = '\0';
- result[n++] = linebuf;
- linebegin = &buffer[i + 1];
- }
- }
- result[n] = NULL;
-
- free(buffer);
-
- return result;
-}
-
-/*
- * Free memory allocated for optlines through readfile()
- */
-void
-free_readfile(char **optlines)
-{
- char *curr_line = NULL;
- int i = 0;
-
- if (!optlines)
- return;
-
- while ((curr_line = optlines[i++]))
- free(curr_line);
-
- free(optlines);
-
- return;
-}
#include "storage/shmem.h"
#include "utils/builtins.h"
+#include "utils/memutils.h"
#include "utils/pg_lsn.h"
#include "utils/syscache.h"
-char *bdr_temp_dump_directory = NULL;
-bool bdr_init_from_basedump = false;
-
-static void bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot);
-
-static void bdr_catchup_to_lsn(int cfg_index,
- XLogRecPtr target_lsn);
-
-/*
- * Search BdrWorkerCtl for a worker in dbname with init_replica set and
- * return it. The first worker found is returned (previous code should've
- * ensured there can only be one). If no match is found, return null.
- *
- * Must be called with at least a share lock on BdrWorkerCtl->lock
- *
- */
-static BdrWorker*
-find_init_replica_worker(Name dbname)
-{
- int off;
-
- Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
- /* Check whether one of our connections has init_replica set */
- for (off = 0; off < bdr_max_workers; off++)
- {
- BdrApplyWorker *aw;
- BdrConnectionConfig *cfg;
- if (BdrWorkerCtl->slots[off].worker_type != BDR_WORKER_APPLY)
- continue;
+char *bdr_temp_dump_directory = NULL;
- aw = &BdrWorkerCtl->slots[off].data.apply;
- cfg = bdr_connection_configs[aw->connection_config_idx];
+static void bdr_init_exec_dump_restore(BDRNodeInfo *node,
+ char *snapshot);
- if ((strcmp(cfg->dbname, NameStr(*dbname)) == 0)
- && cfg->init_replica)
- {
- return &BdrWorkerCtl->slots[off];
- }
- }
- return NULL;
-}
+static void bdr_catchup_to_lsn(remote_node_info *ri, XLogRecPtr target_lsn);
/*
- * Get this node's status value from the remote's bdr.bdr_nodes table
- * and return it.
+ * Make sure the remote node has BDR activated (by inserting the security label).
*
- * If no row is found, '\0' is returned.
+ * This is only needed for UDR.
*/
-static char
-bdr_get_remote_status(PGconn *pgconn)
+static void
+bdr_remote_activate(PGconn *pgconn)
{
PGresult *res;
- char status;
- Oid param_types[] = {TEXTOID, OIDOID, OIDOID};
- const char *param_values[3];
- /* Needs to fit max length of UINT64_FORMAT */
- char sysid_str[33];
- char tlid_str[33];
- char mydatabaseid_str[33];
-
- snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
- GetSystemIdentifier());
- sysid_str[sizeof(sysid_str)-1] = '\0';
-
- snprintf(tlid_str, sizeof(tlid_str), "%u",
- ThisTimeLineID);
- tlid_str[sizeof(tlid_str)-1] = '\0';
- snprintf(mydatabaseid_str, sizeof(mydatabaseid_str), "%u",
- MyDatabaseId);
- mydatabaseid_str[sizeof(mydatabaseid_str)-1] = '\0';
-
- param_values[0] = sysid_str;
- param_values[1] = tlid_str;
- param_values[2] = mydatabaseid_str;
-
- res = PQexecParams(pgconn,
- "SELECT node_status FROM bdr.bdr_nodes "
- "WHERE node_sysid = $1 AND node_timeline = $2 "
- "AND node_dboid = $3 "
- "FOR UPDATE",
- 3, param_types, param_values, NULL, NULL, 0);
+ res = PQexec(pgconn, "SELECT bdr.internal_update_seclabel()");
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
- elog(FATAL, "bdr: Failed to get remote status during bdr init: state %s: %s\n",
+ elog(FATAL, "bdr: Failed to activate remote node during bdr init: state %s: %s\n",
PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
}
- if (PQntuples(res) == 0)
- /* No row found on remote, we're starting from scratch */
- status = '\0';
- else
- {
- char *status_str = PQgetvalue(res, 0, 0);
- Assert(strlen(status_str) == 1);
- status = status_str[0];
- }
PQclear(res);
-
- return status;
-}
-
-/*
- * Update/delete/insert in bdr.bdr_nodes to ensure that the bdr.bdr_nodes row
- * for this worker's node ID matches the passed status before returning.
- *
- * The special case '\0' means "remove the row".
- *
- * No fancy upsert games are required here because we ensure that only one
- * worker can be initing any one database, and that node IDs are unique across
- * a group of BDR nodes.
- */
-static char
-bdr_set_remote_status(PGconn *pgconn, const char status,
- const char prev_status)
-{
- PGresult *res;
- char *status_str;
- const uint64 sysid = GetSystemIdentifier();
- /* Needs to fit max length of UINT64_FORMAT */
- char sysid_str[33];
- char tlid_str[33];
- char mydatabaseid_str[33];
-
- if (status == prev_status)
- /* No action required (we could check the remote, but meh) */
- return status;
-
- snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT,
- GetSystemIdentifier());
- sysid_str[sizeof(sysid_str)-1] = '\0';
-
- snprintf(tlid_str, sizeof(tlid_str), "%u",
- ThisTimeLineID);
- tlid_str[sizeof(tlid_str)-1] = '\0';
-
- snprintf(mydatabaseid_str, sizeof(mydatabaseid_str), "%u",
- MyDatabaseId);
- mydatabaseid_str[sizeof(mydatabaseid_str)-1] = '\0';
-
- if (status == '\0')
- {
- Oid param_types[] = {TEXTOID, OIDOID, OIDOID};
- const char *param_values[3];
- char new_status;
-
- param_values[0] = sysid_str;
- param_values[1] = tlid_str;
- param_values[2] = mydatabaseid_str;
-
- res = PQexecParams(pgconn,
- "DELETE FROM bdr.bdr_nodes WHERE node_sysid = $1"
- " AND node_timeline = $2 AND node_dboid = $3 "
- "RETURNING node_status",
- 3, param_types, param_values, NULL, NULL, 0);
-
- if (PQresultStatus(res) != PGRES_TUPLES_OK)
- {
- elog(FATAL, "bdr: Failed to delete row from bdr_nodes: status %s: %s\n",
- PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
- }
- if (PQntuples(res) == 0)
- {
- /*
- * If prev_status was '\0' we wouldn't be here, so we should've
- * got a returned value.
- */
- elog(FATAL, "bdr: bdr.bdr_nodes row for sysid=" UINT64_FORMAT
- ", tlid=%u, dboid=%u missing, expected row with status=%c",
- sysid, ThisTimeLineID, MyDatabaseId, (int)prev_status);
- }
- status_str = PQgetvalue(res, 0, 0);
- Assert(strlen(status_str) == 1);
- new_status = status_str[0];
-
- if (new_status != prev_status)
- {
- elog(FATAL, "bdr: bdr.bdr_nodes row for node_sysid=" UINT64_FORMAT
- ", timeline=%u, dboid=%u had status=%c, expected status=%c",
- sysid, ThisTimeLineID, MyDatabaseId, (int) new_status,
- (int) prev_status);
- }
-
- PQclear(res);
- }
- else
- {
- Oid param_types[] = {CHAROID, TEXTOID, OIDOID, OIDOID};
- const char *param_values[4];
- char new_status;
- char status_str[2];
-
- snprintf(status_str, 2, "%c", (int)status);
- param_values[0] = status_str;
- param_values[1] = sysid_str;
- param_values[2] = tlid_str;
- param_values[3] = mydatabaseid_str;
-
- res = PQexecParams(pgconn,
- "UPDATE bdr.bdr_nodes "
- "SET node_status = $1 "
- "WHERE node_sysid = $2 AND node_timeline = $3 "
- "AND node_dboid = $4 "
- "RETURNING ("
- " SELECT node_status FROM bdr.bdr_nodes "
- " WHERE node_sysid = $2 AND node_timeline = $3 "
- " AND node_dboid = $4"
- ")",
- 4, param_types, param_values, NULL, NULL, 0);
-
- if (PQresultStatus(res) != PGRES_TUPLES_OK)
- {
- elog(FATAL,
- "bdr: Failed to update bdr.nodes row: status %s: %s\n",
- PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
- }
- if (PQntuples(res) != 0)
- {
- char *new_status_str;
- /* Updated a row */
- new_status_str = PQgetvalue(res, 0, 0);
- Assert(strlen(status_str) == 1);
- new_status = new_status_str[0];
- if (new_status != prev_status)
- {
- elog(FATAL,
- "bdr: bdr.bdr_nodes row for node_sysid=" UINT64_FORMAT
- ", timeline=%u, dboid=%u had status=%c, expected status=%c",
- sysid, ThisTimeLineID, MyDatabaseId, (int)new_status,
- (int)prev_status);
- }
-
- PQclear(res);
- }
- else
- {
- /* No rows affected, insert a new row instead. We re-use the previous
- * query parameters. */
- PQclear(res);
- res = PQexecParams(pgconn,
- "INSERT INTO bdr.bdr_nodes"
- " (node_status, node_sysid, node_timeline, node_dboid)"
- " VALUES ($1, $2, $3, $4);",
- 4, param_types, param_values, NULL, NULL, 0);
-
- if (PQresultStatus(res) != PGRES_COMMAND_OK)
- {
- elog(FATAL,
- "bdr: Failed to insert row into bdr.bdr_nodes: status %s: %s\n",
- PQresStatus(PQresultStatus(res)),
- PQresultErrorMessage(res));
- }
- PQclear(res);
- }
- }
-
- return status;
}
static XLogRecPtr
else if (PQntuples(res) == 0)
{
/* bdr ext is not known to Pg at all */
+ *default_version = NULL;
+ *installed_version = NULL;
}
else
{
pfree(installed_version);
}
-
-static void
-bdr_drop_slot_and_replication_identifier(BdrConnectionConfig *cfg)
-{
-
- PGconn *streamConn;
- RepNodeId replication_identifier;
- NameData slot_name;
- TimeLineID timeline;
- Oid dboid;
- uint64 sysid;
- PGresult *res;
- StringInfoData query;
- char *sqlstate;
- NameData appname;
- char *remote_ident;
-
-
- elog(DEBUG1, "bdr %s: Dropping slot and local ident from connection %s",
- cfg->dbname, cfg->name);
-
- snprintf(NameStr(appname), NAMEDATALEN, "slot drop");
- (NameStr(appname))[NAMEDATALEN-1] = '\0';
-
- /* Establish BDR conn and IDENTIFY_SYSTEM */
- streamConn = bdr_connect(
- cfg->dsn, &appname,
- &sysid, &timeline, &dboid
- );
-
- bdr_build_ident_and_slotname(sysid, timeline, dboid,
- &remote_ident, &slot_name);
-
-
- StartTransactionCommand();
- replication_identifier = GetReplicationIdentifier(remote_ident, true);
-
- pfree(remote_ident);
-
- if (OidIsValid(replication_identifier))
- {
- /* Local replication identifier exists and must be dropped. */
- elog(DEBUG2, "bdr %s: Deleting local replication identifier %hu",
- cfg->dbname, replication_identifier);
- DropReplicationIdentifier(replication_identifier);
- /*
- * We should CHECKPOINT after this to make sure replication
- * identifier state gets flushed.
- */
- RequestCheckpoint(CHECKPOINT_IMMEDIATE|CHECKPOINT_FORCE);
- }
- else
- {
- elog(DEBUG2, "bdr %s: No local replication identifier to delete",
- cfg->dbname);
- }
-
- /*
- * Remove corresponding remote slot if it exists. We can't query
- * whether it exists or not silently over the replication protocol,
- * so we just try it and cope if it's missing.
- */
- initStringInfo(&query);
- appendStringInfo(&query, "DROP_REPLICATION_SLOT %s", NameStr(slot_name));
- res = PQexec(streamConn, query.data);
- if (PQresultStatus(res) == PGRES_COMMAND_OK)
- {
- elog(DEBUG2, "bdr %s: remote replication slot %s deleted",
- cfg->dbname, NameStr(slot_name));
- }
- else
- {
- /* SQLSTATE 42704 expected; others are error conditions */
- sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
- if (strcmp(sqlstate, "42704") != 0)
- {
- ereport(ERROR,
- (errmsg("'DROP_REPLICATION_SLOT %s' on bdr connection %s failed with sqlstate %s: %s",
- NameStr(slot_name), cfg->name,
- sqlstate,PQresultErrorMessage(res))));
- }
- else
- {
- elog(DEBUG2, "bdr %s: No slot to delete", cfg->dbname);
- }
- }
- CommitTransactionCommand();
- PQclear(res);
- PQfinish(streamConn);
-}
-
static void
bdr_init_replica_cleanup_tmpdir(int errcode, Datum tmpdir)
{
/*
* Use a script to copy the contents of a remote node using pg_dump and apply
- * it to the local node. Runs during slot creation to bring up a new logical
- * replica from an existing node.
+ * it to the local node. Runs during node join to bring up a new
+ * logical replica from an existing node. The remote dump is taken from the
+ * start position of a slot on the remote end to ensure that we never replay
+ * changes included in the dump and never miss changes.
*/
static void
-bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot)
+bdr_init_exec_dump_restore(BDRNodeInfo *node,
+ char *snapshot)
{
#ifndef WIN32
pid_t pid;
appendStringInfo(&origin_dsn,
- "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica dump'",
- cfg->dsn, BDR_LOCALID_FORMAT_ARGS, cfg->name);
+ "%s fallback_application_name='"BDR_LOCALID_FORMAT": init_replica dump'",
+ node->init_from_dsn, BDR_LOCALID_FORMAT_ARGS);
- if (cfg->replica_local_dsn == NULL)
- elog(FATAL, "bdr init_replica: no replica_local_dsn specified");
appendStringInfo(&local_dsn,
- "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica restore'",
- cfg->replica_local_dsn, BDR_LOCALID_FORMAT_ARGS, cfg->name);
+ "%s fallback_application_name='"BDR_LOCALID_FORMAT": init_replica restore'",
+ node->local_dsn, BDR_LOCALID_FORMAT_ARGS);
/*
* Suppress replication of changes applied via pg_restore back to
* the local node.
*
- * XXX DYNCONF: This should PQconninfoParse, modify the options keyword or
- * add it, and reconstruct the string using the functions from pg_dumpall
- * (also to be used for init_copy). This is a hack.
+ * TODO: This should PQconninfoParse, modify the options keyword or add
+ * it, and reconstruct the string using the functions from pg_dumpall
+ * (also to be used for init_copy). Simply appending the options
+ * instead is a bit dodgy.
*/
appendStringInfoString(&local_dsn,
" options='-c bdr.do_not_replicate=on -c bdr.permit_unsafe_ddl_commands=on -c bdr.skip_ddl_replication=on -c bdr.skip_ddl_locking=on'");
ereport(LOG,
(errmsg("Creating replica with: %s --snapshot %s --source \"%s\" --target \"%s\" --tmp-directory \"%s\", --pg-dump-path \"%s\", --pg-restore-path \"%s\"",
- bdr_init_replica_script_path, snapshot, cfg->dsn,
- cfg->replica_local_dsn, tmpdir,
+ bdr_init_replica_script_path, snapshot,
+ node->init_from_dsn, node->local_dsn, tmpdir,
bdr_dump_path, bdr_restore_path)));
n = execv(bdr_init_replica_script_path, argv);
#endif
}
+/*
+ * BDR state synchronization.
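+ *
+ * Copies every remote bdr.bdr_nodes row to the local node, the local node's
+ * own bdr.bdr_nodes row to the remote node, and the remote bdr.bdr_connections
+ * rows to the local node, with both tables locked on both ends for the
+ * duration.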
+ */
static void
-bdr_init_replica_conn_close(int code, Datum connptr)
+bdr_sync_nodes(PGconn *remote_conn, BDRNodeInfo *local_node)
{
- PGconn **conn_p;
- PGconn *conn;
+ PGconn *local_conn;
- conn_p = (PGconn**) DatumGetPointer(connptr);
- Assert(conn_p != NULL);
- conn = *conn_p;
+ local_conn = bdr_connect_nonrepl(local_node->local_dsn, "init");
- if (conn == NULL)
- return;
- if (PQstatus(conn) != CONNECTION_OK)
- return;
- PQfinish(conn);
+ PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&local_conn));
+ {
+ StringInfoData query;
+ PGresult *res;
+ char sysid_str[33];
+ const char *const setup_query =
+ "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;\n"
+ "SET LOCAL search_path = bdr, pg_catalog;\n"
+ "SET LOCAL bdr.permit_unsafe_ddl_commands = on;\n"
+ "SET LOCAL bdr.skip_ddl_replication = on;\n"
+ "SET LOCAL bdr.skip_ddl_locking = on;\n"
+ "LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;\n"
+ "LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;\n";
+
+ /* Setup the environment. */
+ res = PQexec(remote_conn, setup_query);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "BEGIN or table locking on remote failed: %s",
+ PQresultErrorMessage(res));
+ PQclear(res);
+
+ res = PQexec(local_conn, setup_query);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "BEGIN or table locking on local failed: %s",
+ PQresultErrorMessage(res));
+ PQclear(res);
+
+ /* Copy remote bdr_nodes entries to the local node. */
+ bdr_copytable(remote_conn, local_conn,
+ "COPY (SELECT * FROM bdr.bdr_nodes) TO stdout",
+ "COPY bdr.bdr_nodes FROM stdin");
+
+ /* Copy the local entry to remote node. */
+ initStringInfo(&query);
+ /* No need to quote as everything is numbers. */
+ snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, local_node->sysid);
+ sysid_str[sizeof(sysid_str)-1] = '\0';
+ appendStringInfo(&query,
+ "COPY (SELECT * FROM bdr.bdr_nodes WHERE "
+ "node_sysid = '%s' AND node_timeline = '%u' "
+ "AND node_dboid = '%u') TO stdout",
+ sysid_str, local_node->timeline, local_node->dboid);
+
+ bdr_copytable(local_conn, remote_conn,
+ query.data, "COPY bdr.bdr_nodes FROM stdin");
+
+ /*
+ * Copy remote connections to the local node.
+ *
+ * Adding local connection to remote node is handled separately
+ * because it triggers the connect-back process on the remote node(s).
+ */
+ bdr_copytable(remote_conn, local_conn,
+ "COPY (SELECT * FROM bdr.bdr_connections) TO stdout",
+ "COPY bdr.bdr_connections FROM stdin");
+
+ /* Save changes. */
+ res = PQexec(remote_conn, "COMMIT");
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "COMMIT on remote failed: %s",
+ PQresultErrorMessage(res));
+ PQclear(res);
+
+ res = PQexec(local_conn, "COMMIT");
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "COMMIT on remote failed: %s",
+ PQresultErrorMessage(res));
+ PQclear(res);
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&local_conn));
+ PQfinish(local_conn);
+}
+
+static void
+bdr_insert_remote_conninfo(PGconn *conn, BdrConnectionConfig *myconfig)
+{
+#define INTERNAL_NODE_JOIN_NPARAMS 6
+ PGresult *res;
+ Oid types[INTERNAL_NODE_JOIN_NPARAMS] = { TEXTOID, OIDOID, OIDOID, TEXTOID, INT4OID, TEXTARRAYOID };
+ const char *values[INTERNAL_NODE_JOIN_NPARAMS];
+ StringInfoData replicationsets;
+
+ /* Needs to fit max length of UINT64_FORMAT */
+ char sysid_str[33];
+ char tlid_str[33];
+ char mydatabaseid_str[33];
+ char apply_delay[33];
+
+ initStringInfo(&replicationsets);
+
+ stringify_my_node_identity(sysid_str, sizeof(sysid_str),
+ tlid_str, sizeof(tlid_str),
+ mydatabaseid_str, sizeof(mydatabaseid_str));
+
+ values[0] = &sysid_str[0];
+ values[1] = &tlid_str[0];
+ values[2] = &mydatabaseid_str[0];
+ values[3] = myconfig->dsn;
+
+ snprintf(&apply_delay[0], 33, "%d", myconfig->apply_delay);
+ values[4] = &apply_delay[0];
+ /*
+ * Replication sets are stored as a quoted identifier list. To turn
+ * it into an array literal we can just wrap some brackets around it.
+ */
+ appendStringInfo(&replicationsets, "{%s}", myconfig->replication_sets);
+ values[5] = replicationsets.data;
+
+ res = PQexecParams(conn,
+ "SELECT bdr.internal_node_join($1,$2,$3,$4,$5,$6);",
+ INTERNAL_NODE_JOIN_NPARAMS,
+ types, &values[0], NULL, NULL, 0);
+
+ /*
+ * bdr.internal_node_join() must correctly handle unique violations.
+ * Otherwise init that resumes after slot creation, when we're waiting
+ * for inbound slots, will fail.
+ */
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ elog(ERROR, "unable to update remote bdr.bdr_connections: %s",
+ PQerrorMessage(conn));
+
+#undef INTERNAL_NODE_JOIN_NPARAMS
}
/*
- * Determine whether we need to initialize the database from a remote
- * node and perform the required initialization if so.
+ * Find all connections other than our own using the copy of
+ * bdr.bdr_connections that we acquired from the remote server during the
+ * initial dump and restore. Apply workers won't be started yet, we're just
+ * making the
+ * slots.
+ *
+ * If the slot already exists from a prior attempt we'll leave it
+ * alone. It'll be advanced when we start replaying from it anyway,
+ * and it's guaranteed to retain more than the WAL we need.
*/
-void
-bdr_init_replica(Name dbname)
+static void
+bdr_init_make_other_slots()
{
- char status;
- XLogRecPtr min_remote_lsn;
- PGconn *nonrepl_init_conn;
- StringInfoData dsn;
- BdrWorker *init_replica_worker;
- BdrConnectionConfig *init_replica_config;
- int spi_ret;
+ List *configs;
+ ListCell *lc;
+ MemoryContext old_context;
- initStringInfo(&dsn);
+ Assert(!IsTransactionState());
+ StartTransactionCommand();
+ old_context = MemoryContextSwitchTo(TopMemoryContext);
+ configs = bdr_read_connection_configs();
+ MemoryContextSwitchTo(old_context);
+ CommitTransactionCommand();
+
+ foreach(lc, configs)
+ {
+ BdrConnectionConfig *cfg = lfirst(lc);
+ PGconn *conn;
+ NameData slot_name;
+ uint64 sysid;
+ TimeLineID timeline;
+ Oid dboid;
+ RepNodeId replication_identifier;
+ char *snapshot;
+
+ if (cfg->sysid == GetSystemIdentifier() &&
+ cfg->timeline == ThisTimeLineID &&
+ cfg->dboid == MyDatabaseId)
+ {
+			/* Don't make a slot pointing to ourselves */
+			bdr_free_connection_config(cfg);
+			continue;
+ }
+
+ conn = bdr_establish_connection_and_slot(cfg->dsn, "mkslot", &slot_name,
+ &sysid, &timeline, &dboid, &replication_identifier,
+ &snapshot);
+
+ /* Ensure the slot points to the node the conn info says it should */
+ if (cfg->sysid != sysid ||
+ cfg->timeline != timeline ||
+ cfg->dboid != dboid)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("System identification mismatch between connection and slot"),
+ errdetail("Connection for "BDR_LOCALID_FORMAT" resulted in slot on node "BDR_LOCALID_FORMAT" instead of expected node",
+ cfg->sysid, cfg->timeline, cfg->dboid, EMPTY_REPLICATION_NAME,
+ sysid, timeline, dboid, EMPTY_REPLICATION_NAME)));
+ }
+
+ /* We don't require the snapshot IDs here */
+ if (snapshot != NULL)
+ pfree(snapshot);
+
+ /* No replication for now, just close the connection */
+ PQfinish(conn);
- elog(DEBUG2, "bdr %s: bdr_init_replica",
- NameStr(*dbname));
+ elog(DEBUG2, "Ensured existence of slot %s on "BDR_LOCALID_FORMAT,
+ NameStr(slot_name), cfg->sysid, cfg->timeline, cfg->dboid,
+ EMPTY_REPLICATION_NAME);
+
+ bdr_free_connection_config(cfg);
+ }
+
+ list_free(configs);
+}
+
+/*
+ * For each outbound connection in bdr.bdr_connections we should have a local
+ * replication slot created by a remote node using our connection info.
+ *
+ * Wait until all such entries are created and active, then return.
+ */
+static void
+bdr_init_wait_for_slot_creation()
+{
+ List *configs;
+ ListCell *lc;
+ Name* slot_names;
+ Size n_slots;
+ int tup_idx, arr_idx;
+
+ elog(INFO, "waiting for all inbound slots to be established");
/*
- * The local SPI transaction we're about to perform must do any writes as a
- * local transaction, not as a changeset application from a remote node.
- * That allows rows to be repliated to other nodes. So no replication_origin_id
- * may be set.
+ * Determine the list of expected slot identifiers. These are
+ * inbound slots, so they're our db oid + the remote's bdr ident.
*/
- Assert(replication_origin_id == InvalidRepNodeId);
+ StartTransactionCommand();
+ configs = bdr_read_connection_configs();
+
+ slot_names = (Name*)palloc0(sizeof(Name) * list_length(configs));
+
+ n_slots = 0;
+ foreach(lc, configs)
+ {
+ BdrConnectionConfig *cfg = lfirst(lc);
+ Name slot_name;
+
+ if (cfg->sysid == GetSystemIdentifier() &&
+ cfg->timeline == ThisTimeLineID &&
+ cfg->dboid == MyDatabaseId)
+ {
+ /* We won't see an inbound slot from our own node */
+ continue;
+ }
+
+ /* There's no corresponding incoming slot for a unidirectional slot */
+ if (cfg->is_unidirectional)
+ continue;
+
+ slot_name = (NameData*) palloc0(sizeof(NameData));
+ bdr_slot_name(slot_name, cfg->sysid, cfg->timeline, cfg->dboid,
+ MyDatabaseId);
+
+ elog(DEBUG2, "expecting inbound slot named %s", NameStr(*slot_name));
+
+ slot_names[n_slots++] = slot_name;
+ }
/*
- * Check the local bdr.bdr_nodes over SPI or direct scan to see if
- * there's an entry for ourselves in ready mode already.
+ * Wait for each to be created. There's no useful way to be notified when a
+ * slot gets created, so just scan all slots to see if all the ones we want
+ * are present and active. If not, sleep and retry soon.
*
- * Note that we don't have to explicitly SPI_finish(...) on error paths;
- * that's taken care of for us.
+ * This is a very inefficient approach but for the number of slots we're
+ * interested in it doesn't matter.
*/
- StartTransactionCommand();
- spi_ret = SPI_connect();
- if (spi_ret != SPI_OK_CONNECT)
- elog(ERROR, "SPI already connected; this shouldn't be possible");
+ SPI_connect();
- status = bdr_nodes_get_local_status(GetSystemIdentifier(), ThisTimeLineID,
- MyDatabaseId);
- if (status == 'r')
+ while (true)
{
- /* Already in ready state, nothing more to do */
- elog(DEBUG2, "init_replica: Already inited");
- SPI_finish();
- CommitTransactionCommand();
- return;
+		Datum	values[1] = {ObjectIdGetDatum(MyDatabaseId)};
+ Oid types[1] = {OIDOID};
+ Size n_slots_found = 0;
+
+ SPI_execute_with_args("select slot_name "
+ "from pg_catalog.pg_replication_slots "
+ "where plugin = '"BDR_LIBRARY_NAME"' "
+ "and slot_type = 'logical' "
+ "and datoid = $1 and active",
+ 1, types, values, NULL, false, 0);
+
+ for (tup_idx = 0; tup_idx < SPI_processed; tup_idx++)
+ {
+ char *slot_name;
+
+ slot_name = SPI_getvalue(SPI_tuptable->vals[tup_idx],
+ SPI_tuptable->tupdesc,
+ 1);
+
+ Assert(slot_name != NULL);
+
+ /*
+ * Does this slot appear in the array of expected slots and if so,
+ * have we seen it already?
+ *
+ * This is O(m*n) for m existing slots and n expected slots, but
+ * really, for this many slots, who cares.
+ */
+ for (arr_idx = 0; arr_idx < n_slots; arr_idx++)
+ {
+				if (strcmp(NameStr(*slot_names[arr_idx]), slot_name) == 0)
+ {
+ n_slots_found++;
+ break;
+ }
+ }
+ }
+
+ if (n_slots_found == n_slots)
+ break;
+
+ elog(DEBUG2, "found %u of %u expected slots, sleeping",
+ (uint32)n_slots_found, (uint32)n_slots);
+
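+		/* Sleep 100ms before scanning pg_replication_slots again */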
+ pg_usleep(100000);
}
+ SPI_finish();
+
+ CommitTransactionCommand();
+
+ elog(INFO, "all inbound slots established");
+
/*
- * Before starting workers we must determine if we need to copy
- * initial state from a remote node. This is only necessary if
- * there is a connection with init_replica set and we do not yet
- * have an entry in the local "bdr.bdr_nodes" table for our node
- * ID showing initialisation to be complete.
+ * Should this also check all outbound workers are connected? Doing so
+ * isn't simple - checking for replication identifiers doesn't confirm that
+ * the connection is active. We'd need to talk to the apply workers or try
+ * to convey information via pg_stat_activity.
*/
- LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
- init_replica_worker = find_init_replica_worker(dbname);
- LWLockRelease(BdrWorkerCtl->lock);
- if (!init_replica_worker)
+}
+
+/*
+ * TODO DYNCONF perform_pointless_transaction
+ *
+ * This is temporary code to be removed when the full part/join protocol is
+ * introduced, at which point WAL messages should handle this. See comments on
+ * call site.
+ */
+static void
+perform_pointless_transaction(PGconn *conn, BDRNodeInfo *node)
+{
+ PGresult *res;
+
+ res = PQexec(conn, "CREATE TEMP TABLE bdr_init(a int) ON COMMIT DROP");
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+		elog(ERROR, "unable to perform dummy transaction on target node: %s",
+			 PQresultErrorMessage(res));
+ PQclear(res);
+}
+
+/*
+ * Initialize the database, from a remote node if necessary.
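+ *
+ * The local node's bdr.bdr_nodes status advances through the phases handled
+ * below: 'b' (clean state) -> 'i' (dump/restore in progress) -> 'c' (catchup
+ * replay) -> 'o' (waiting for slot creation) -> 'r' (ready).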
+ */
+void
+bdr_init_replica(BDRNodeInfo *local_node)
+{
+ char status;
+ PGconn *nonrepl_init_conn;
+ StringInfoData dsn;
+ BdrConnectionConfig *local_conn_config;
+
+ initStringInfo(&dsn);
+
+ status = local_node->status;
+
+ Assert(status != 'r');
+
+ elog(DEBUG2, "bdr_init_replica");
+
+ /*
+ * The local SPI transaction we're about to perform must do any writes as a
+ * local transaction, not as a changeset application from a remote node.
+ * That allows rows to be replicated to other nodes. So no replication_origin_id
+ * may be set.
+ */
+ Assert(replication_origin_id == InvalidRepNodeId);
+
+ /*
+ * Before starting workers we must determine if we need to copy initial
+ * state from a remote node. This is necessary unless we are the first node
+ * created or we've already completed init. If we'd already completed init
+ * we would've exited above.
+ */
+ if (local_node->init_from_dsn == NULL)
{
- if (status != '\0')
+ if (status != 'b')
{
/*
* Even though there's no init_replica worker, the local bdr.bdr_nodes table
* has an entry for our (sysid,dbname) and it isn't status=r (checked above),
- * we must've had an init_replica configured before, then removed.
+			 * so this should never happen.
*/
- ereport(ERROR, (errmsg("bdr.bdr_nodes row with (sysid="
- UINT64_FORMAT ", dbname=%s) exists and has status=%c, but "
- "no connection with init_replica=t is configured for this "
- "database. ",
- GetSystemIdentifier(), NameStr(*dbname), status),
- errdetail("You probably configured initial setup with "
- "init_replica on a connection, then removed or changed that "
- "connection before setup completed properly. "),
- errhint("DROP and re-create the database if it has no "
- "existing content of value, or add the init_replica setting "
- "to one of the connections.")));
+ ereport(ERROR, (errmsg("bdr.bdr_nodes row with "BDR_LOCALID_FORMAT" exists and has status=%c, "
+ "but has init_from_dsn set to NULL",
+ GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId, EMPTY_REPLICATION_NAME, status)));
}
/*
* No connections have init_replica=t, so there's no remote copy to do.
* We still have to ensure that bdr.bdr_nodes.status is 'r' for this
* node so that slot creation is permitted.
+ *
+ * XXX: is this actually a good idea?
*/
elog(DEBUG2, "init_replica: Marking as root/standalone node");
bdr_nodes_set_local_status('r');
- }
- /*
- * We no longer require the transaction for SPI; further work gets done on
- * the remote machine's bdr.bdr_nodes table and replicated back to us via
- * pg_dump/pg_restore, or over the walsender protocol once we start
- * replay. If we aren't just about to exit anyway.
- */
- SPI_finish();
- CommitTransactionCommand();
- if (!init_replica_worker)
- /* Cleanup done and nothing more to do */
return;
+ }
- init_replica_config = bdr_connection_configs
- [init_replica_worker->data.apply.connection_config_idx];
- elog(LOG, "bdr %s: bdr_init_replica init from connection %s",
- NameStr(*dbname), init_replica_config->name);
+ local_conn_config = bdr_get_connection_config(
+ local_node->sysid,
+ local_node->timeline,
+ local_node->dboid,
+ true);
- resetStringInfo(&dsn);
- appendStringInfo(&dsn,
- "%s fallback_application_name='"BDR_LOCALID_FORMAT": %s: init_replica setup'",
- init_replica_config->dsn, BDR_LOCALID_FORMAT_ARGS,
- init_replica_config->name);
+ elog(DEBUG1, "init_replica init from remote %s",
+ local_node->init_from_dsn);
- /*
- * Test to see if there's an entry in the remote's bdr.bdr_nodes for our
- * system identifier. If there is, that'll tell us what stage of startup
- * we are up to and let us resume an incomplete start.
- */
- nonrepl_init_conn = PQconnectdb(dsn.data);
- if (PQstatus(nonrepl_init_conn) != CONNECTION_OK)
- {
- ereport(FATAL,
- (errmsg("bdr %s: could not connect to the upstream server in non-replication mode: %s",
- NameStr(*dbname),
- PQerrorMessage(nonrepl_init_conn))));
- }
+ nonrepl_init_conn =
+ bdr_connect_nonrepl(local_node->init_from_dsn, "init");
- PG_ENSURE_ERROR_CLEANUP(bdr_init_replica_conn_close,
- PointerGetDatum(&nonrepl_init_conn));
+ PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&nonrepl_init_conn));
{
bdr_ensure_ext_installed(nonrepl_init_conn);
- /* Get the bdr.bdr_nodes status field for our node id from the remote */
- status = bdr_get_remote_status(nonrepl_init_conn);
-
- if (bdr_init_from_basedump)
- {
- status = bdr_set_remote_status(nonrepl_init_conn, 'c', status);
- }
- else
+ switch (status)
{
- switch (status)
- {
- case '\0':
- elog(DEBUG2, "bdr %s: initializing from clean state",
- NameStr(*dbname));
- break;
+ case 'b':
+ elog(DEBUG2, "initializing from clean state");
+ break;
- case 'r':
- /*
- * Init has been completed, but we didn't check our local
- * bdr.bdr_nodes, or the final update hasn't propagated yet.
- *
- * All we need to do is catch up, we already replayed enough to be
- * consistent and start up in normal mode last time around
- */
- elog(DEBUG2, "bdr %s: init already completed, nothing to do",
- NameStr(*dbname));
- return;
-
- case 'c':
- /*
- * We were in catchup mode when we died. We need to resume catchup
- * mode up to the expected LSN before switching over.
- *
- * To do that all we need to do is fall through without doing any
- * slot re-creation, dump/apply, etc, and pick up when we do
- * catchup.
- *
- * We won't know what the original catchup target point is, but we
- * can just catch up to whatever xlog position the server is
- * currently at.
- */
- elog(DEBUG2, "bdr %s: dump applied, need to continue catchup",
- NameStr(*dbname));
- break;
+ case 'r':
+ elog(ERROR, "unexpected state");
- case 'i':
- /*
- * A previous init attempt seems to have failed. Clean up, then
- * fall through to start setup again.
- *
- * We can't just re-use the slot and replication identifier that
- * were created last time (if they were), because we have no way
- * of getting the slot's exported snapshot after
- * CREATE_REPLICATION_SLOT.
- */
- elog(DEBUG2, "bdr %s: previous failed initalization detected, cleaning up",
- NameStr(*dbname));
- bdr_drop_slot_and_replication_identifier(init_replica_config);
- status = bdr_set_remote_status(nonrepl_init_conn, '\0', status);
- break;
+ case 'c':
+ /*
+ * We were in catchup mode when we died. We need to resume catchup
+ * mode up to the expected LSN before switching over.
+ *
+ * To do that all we need to do is fall through without doing any
+ * slot re-creation, dump/apply, etc, and pick up where we do
+ * catchup.
+ *
+ * We won't know what the original catchup target point is, but we
+ * can just catch up to whatever xlog position the server is
+ * currently at, it's guaranteed to be later than the target
+ * position.
+ */
+ elog(DEBUG2, "dump applied, need to continue catchup");
+ break;
- default:
- elog(ERROR, "unreachable"); /* Unhandled case */
- break;
- }
+ case 'o':
+ elog(DEBUG2, "dump applied and catchup completed, need to continue slot creation");
+ break;
+
+ case 'i':
+ /*
+ * A previous init attempt seems to have failed.
+ * Clean up, then fall through to start setup
+ * again.
+ *
+ * We can't just re-use the slot and replication
+ * identifier that were created last time (if
+ * they were), because we have no way of getting
+ * the slot's exported snapshot after
+ * CREATE_REPLICATION_SLOT.
+ *
+ * We could drop and re-create the slot, but...
+ *
+ * We also have no way to undo a failed
+ * pg_restore, so if that phase fails it's
+ * necessary to do manual cleanup, dropping and
+ * re-creating the db.
+ *
+				 * To avoid that we need to be able to run
+ * pg_restore --clean, and that needs a way to
+ * exclude the bdr schema, the bdr extension,
+ * and their dependencies like plpgsql and
+ * btree_gist. (TODO patch pg_restore for that)
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("previous init failed, manual cleanup is required"),
+ errdetail("Found bdr.bdr_nodes entry for "BDR_LOCALID_FORMAT" with state=i in remote bdr.bdr_nodes", BDR_LOCALID_FORMAT_ARGS),
+ errhint("Remove all replication identifiers and slots corresponding to this node from the init target node then drop and recreate this database and try again")));
+ break;
+
+ default:
+ elog(ERROR, "unreachable %c", status); /* Unhandled case */
+ break;
}
- if (status == '\0')
+ if (status == 'b')
{
- int off;
- int *my_conn_idxs;
- int n_conns = 0;
char *init_snapshot = NULL;
PGconn *init_repl_conn = NULL;
+ NameData slot_name;
+ uint64 remote_sysid;
+ TimeLineID remote_timeline;
+ Oid remote_dboid;
+ RepNodeId repnodeid;
- elog(LOG, "bdr %s: initializing from remote db", NameStr(*dbname));
+ elog(INFO, "initializing node");
/*
* We're starting from scratch or have cleaned up a previous failed
* attempt.
*/
- status = bdr_set_remote_status(nonrepl_init_conn, 'i', status);
+ status = 'i';
+ bdr_nodes_set_local_status(status);
/*
- * A list of all connections to make slots for, as indexes into
- * BdrWorkerCtl.
+		 * This is a unidirectional subscription; let the other node know
+		 * that it should behave as a BDR node (it might be a UDR node,
+		 * which does not require init).
*/
- my_conn_idxs = (int*)palloc(sizeof(Size) * bdr_max_workers);
+ if (local_conn_config == NULL)
+ bdr_remote_activate(nonrepl_init_conn);
- /* Collect a list of connections to make slots for. */
- LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
- for (off = 0; off < bdr_max_workers; off++)
- {
- BdrWorker *worker = &BdrWorkerCtl->slots[off];
-
- if (worker->worker_type == BDR_WORKER_APPLY)
- {
- BdrConnectionConfig * const cfg = bdr_connection_configs
- [worker->data.apply.connection_config_idx];
+ /*
+ * Now establish our slot on the target node, so we can replay
+ * changes from that node. It'll be used in catchup mode.
+ */
+ init_repl_conn = bdr_establish_connection_and_slot(
+ local_node->init_from_dsn,
+ "init", &slot_name,
+ &remote_sysid, &remote_timeline, &remote_dboid,
+ &repnodeid, &init_snapshot);
- if (strcmp(cfg->dbname, NameStr(*dbname)) == 0)
- my_conn_idxs[n_conns++] = off;
- }
- }
- LWLockRelease(BdrWorkerCtl->lock);
+ elog(INFO, "connected to target node "BDR_LOCALID_FORMAT
+ " with snapshot %s",
+ remote_sysid, remote_timeline, remote_dboid,
+ EMPTY_REPLICATION_NAME, init_snapshot);
- elog(DEBUG2, "bdr %s: creating slots for %d nodes",
- NameStr(*dbname), n_conns);
+ /*
+ * Take the remote dump and apply it. This will give us a local
+ * copy of bdr_connections to work from. It's guaranteed that
+ * everything after this dump will be accessible via the catchup
+ * mode slot created earlier.
+ */
+ bdr_init_exec_dump_restore(local_node, init_snapshot);
/*
- * For each connection, ensure its slot exists.
+ * TODO DYNCONF copy replication identifier state
+ *
+ * Should copy the target node's pg_catalog.pg_replication_identifier
+ * state for each node to the local node, using the same snapshot
+ * we used to take the dump from the remote. Doing this ensures
+ * that when we create slots to the target nodes they'll begin
+ * replay from a position that's exactly consistent with what's
+ * in the dump.
*
- * Do it one by one rather than fiddling with async libpq queries. If
- * this needs to be parallelized later, it should probably be done by
- * launching each apply worker and letting them create their own
- * slots, then having them wait until signalled/unlatched before
- * proceeding with actual replication. That'll save us another round
- * of connections too.
+ * We'll still need catchup mode because there's no guarantee our
+ * newly created slots will force all WAL we'd need to be retained
+ * on each node. The target might be behind. So we should catchup
+ * replay until the replication identifier positions received from
+ * catchup are >= the creation positions of the slots we made.
*
- * We don't attempt any cleanup if slot creation fails, we just bail out
- * and leave any already-created slots in place.
+ * (We don't need to do this if we instead send a replay confirmation
+ * request and wait for a reply from each node.)
*/
- for (off = 0; off < n_conns; off++)
- {
- BdrWorker *w = &BdrWorkerCtl->slots[my_conn_idxs[off]];
- BdrConnectionConfig *cfg;
- char *snapshot = NULL;
- PGconn *conn = NULL;
- RepNodeId replication_identifier;
- NameData slot_name;
- uint64 sysid;
- Oid dboid;
- TimeLineID timeline;
-
- cfg = bdr_connection_configs
- [w->data.apply.connection_config_idx];
-
- ereport(LOG,
- (errmsg("bdr %s: checking/creating slot for %s at %s",
- NameStr(*dbname), cfg->name, cfg->dsn)));
- /*
- * Create the slot on the remote. The returned remote sysid and
- * timeline, the slot name, and the local replication identifier
- * are all discarded; they're not needed here, and will be obtained
- * again by the apply workers when they're launched after init.
- */
- conn = bdr_establish_connection_and_slot(cfg->dsn, "slot",
- &slot_name, &sysid, &timeline, &dboid, &replication_identifier,
- &snapshot);
- /* Always throws rather than returning failure */
- Assert(conn);
+ PQfinish(init_repl_conn);
+ pfree(init_snapshot);
- if (w == init_replica_worker)
- {
- /*
- * We need to keep the snapshot ID returned by CREATE SLOT so
- * we can pass it to pg_dump to get a consistent dump from the
- * remote slot's start point.
- *
- * The snapshot is only valid for the lifetime of the
- * replication connection we created it with, so we must keep
- * that connection around until the dump finishes.
- */
- if (!snapshot)
- elog(ERROR, "bdr %s: init_replica failed to create snapshot!",
- NameStr(*dbname));
- init_snapshot = snapshot;
- init_repl_conn = conn;
- }
- else
- {
- /*
- * Just throw the returned info away; we only needed to create
- * the slot so its replication identifier can be advanced
- * during catchup.
- */
- if (snapshot)
- pfree(snapshot);
- PQfinish(conn);
- }
+ /*
+		 * This is a group join; copy the state (bdr_nodes and
+ * bdr_connections) over from the init node to our node.
+ */
+ if (local_conn_config != NULL)
+ {
+ elog(DEBUG1, "syncing bdr_nodes and bdr_connections");
+ bdr_sync_nodes(nonrepl_init_conn, local_node);
}
- pfree(my_conn_idxs);
+ status = 'c';
+ bdr_nodes_set_local_status(status);
+ elog(DEBUG1, "dump and apply finished, preparing for catchup replay");
+ }
+
+ Assert(status != 'b');
+
+ if (status == 'c')
+ {
+ XLogRecPtr min_remote_lsn;
+ remote_node_info ri;
+
+ /*
+ * Launch outbound connections to all other nodes. It doesn't
+ * matter that their slot horizons are after the dump was taken on
+ * the origin node, so we could never replay all the data we need
+ * if we switched to replaying from these slots now. We'll be
+ * advancing them in catchup mode until they overtake their current
+ * position before switching to replaying from them directly.
+ */
+ bdr_init_make_other_slots();
- /* If we get here, we should have a valid snapshot to dump */
- Assert(init_snapshot != NULL);
- Assert(init_repl_conn != NULL);
+ /*
+ * Enter catchup mode and wait until we've replayed up to the LSN
+ * the remote was at when we started catchup.
+ *
+ * TODO: It's possible that this step can lose transactions that
+ * were committed on a 3rd party node before we made our slot on it
+ * but not replicated to the init target node until after we exit
+ * catchup mode. If we acquire the DDL lock during join we can know
+ * that can't happen, so we should do that.
+ */
+ elog(DEBUG3, "getting LSN to replay to in catchup mode");
+ min_remote_lsn = bdr_get_remote_lsn(nonrepl_init_conn);
/*
- * Execute the dump and apply its self.
+ * Catchup cannot complete if there isn't at least one remote transaction
+ * to replay. So we perform a dummy transaction on the target node.
*
- * Note that the bdr extension tables override pg_dump's default and
- * ask to be included in dumps. In particular, bdr.bdr_nodes will get
- * copied over.
+ * XXX This is a hack. What we really *should* be doing is asking
+ * the target node to send a catchup confirmation wal message, then
+		 * wait until all its current peers (we aren't one yet) reply with
+ * confirmation. Then we should be replaying until we get
+ * confirmation of this from the init target node, rather than
+ * replaying to some specific LSN. The full part/join
+ * protocol should take care of this.
*/
- elog(DEBUG1, "bdr %s: creating and restoring dump for %s",
- NameStr(*dbname), init_replica_config->name);
- bdr_exec_init_replica(init_replica_config, init_snapshot);
- PQfinish(init_repl_conn);
+ elog(DEBUG3, "forcing a new transaction on the target node");
+ perform_pointless_transaction(nonrepl_init_conn, local_node);
- pfree(init_snapshot);
- status = bdr_set_remote_status(nonrepl_init_conn, 'c', status);
+ bdr_get_remote_nodeinfo_internal(nonrepl_init_conn, &ri);
+
+ /* Launch the catchup worker and wait for it to finish */
+ elog(DEBUG1, "launching catchup mode apply worker");
+ bdr_catchup_to_lsn(&ri, min_remote_lsn);
+
+ free_remote_node_info(&ri);
+
+ /*
+ * We're done with catchup. The next phase is inserting our
+ * conninfo, so set status=o
+ */
+ status = 'o';
+ bdr_nodes_set_local_status(status);
+ elog(DEBUG1, "catchup worker finished, requesting slot creation");
}
- Assert(status == 'c');
+ /* To reach here we must be waiting for slot creation */
+ Assert(status == 'o');
+
+ /*
+ * It is now safe to start apply workers, as we've finished catchup.
+ * Doing so ensures that we will replay our own bdr.bdr_nodes changes
+ * from the target node and also makes sure we stay more up-to-date,
+ * reducing slot lag on other nodes.
+ */
+ bdr_launch_apply_workers(MyDatabaseId);
- /* Launch the catchup worker and wait for it to finish */
- elog(DEBUG1, "bdr %s: launching catchup mode apply worker", NameStr(*dbname));
- min_remote_lsn = bdr_get_remote_lsn(nonrepl_init_conn);
- bdr_catchup_to_lsn(
- init_replica_worker->data.apply.connection_config_idx,
- min_remote_lsn);
- status = bdr_set_remote_status(nonrepl_init_conn, 'r', status);
+ /*
+ * Insert our connection info on the remote end. This will prompt
+ * the other end to connect back to us and make a slot, and will
+ * cause the other nodes to do the same when they receive the new
+ * row.
+ *
+ * It makes no sense to do this with UDR, where the peer doesn't
+ * connect back to us.
+ */
+ if (local_conn_config != NULL)
+ {
+ elog(DEBUG1, "inserting our connection into into remote end");
+ bdr_insert_remote_conninfo(nonrepl_init_conn, local_conn_config);
+ }
+
+ /*
+ * Wait for all outbound and inbound slot creation to be complete.
+ *
+ * The inbound slots aren't yet required to relay local writes to
+ * remote nodes, but they'll be used to write our catchup
+ * confirmation request WAL message, so we need them to exist.
+ *
+ * This makes no sense on UDR, where the init target doesn't
+ * connect back to us and no other inbound or outbound connections
+ * exist. It still gets run, but we won't find any inbound
+ * slots to look for.
+ */
+ elog(DEBUG1, "waiting for all inbound slots to be created");
+ bdr_init_wait_for_slot_creation();
- elog(INFO, "bdr %s: catchup worker finished, ready for normal replication",
- NameStr(*dbname));
+ /*
+ * We now have inbound and outbound slots for all nodes, and
+ * we're caught up to a reasonably recent state from the target
+ * node thanks to the dump and catchup mode operation.
+ *
+ * Set the node state to 'r'eady and allow writes.
+ *
+ * TODO: Before we can really be sure we're ready we should be
+ * sending a replay confirmation request and waiting for all
+ * nodes to reply, so we know we have full communication.
+ */
+ status = 'r';
+ bdr_nodes_set_local_status(status);
+ elog(INFO, "finished init_replica, ready to enter normal replication");
}
- PG_END_ENSURE_ERROR_CLEANUP(bdr_init_replica_conn_close,
- PointerGetDatum(&nonrepl_init_conn));
+ PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&nonrepl_init_conn));
+
+ Assert(status == 'r');
PQfinish(nonrepl_init_conn);
}
* When we finish applying and the worker exits, we'll be caught up with the
* remote and in a consistent state where all our local replication identifiers
* are consistent with the actual state of the local DB.
- *
- * Arguments:
- *
- * cfg_index: Index of the bdr connection for this dbname with init_worker=t
- * set within bdr_connection_configs. Used to start the worker.
- *
- * target_lsn: LSN of immediate origin node at which catchup should stop.
*/
static void
-bdr_catchup_to_lsn(int cfg_index,
- XLogRecPtr target_lsn)
+bdr_catchup_to_lsn(remote_node_info *ri, XLogRecPtr target_lsn)
{
uint32 worker_shmem_idx;
BdrWorker *worker;
- BdrConnectionConfig *cfg;
-
- cfg = bdr_connection_configs[cfg_index];
- Assert(cfg != NULL);
- Assert(cfg->init_replica);
+ BdrApplyWorker *catchup_worker;
- elog(DEBUG1, "Registering bdr apply catchup worker %s for db %s to lsn %X/%X",
- cfg->name, cfg->dbname,
+ elog(DEBUG1, "Registering bdr apply catchup worker for "BDR_LOCALID_FORMAT" to lsn %X/%X",
+ ri->sysid, ri->timeline, ri->dboid, EMPTY_REPLICATION_NAME,
(uint32)(target_lsn>>32), (uint32)target_lsn);
/* Create the shmem entry for the catchup worker */
+ LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
worker = bdr_worker_shmem_alloc(BDR_WORKER_APPLY, &worker_shmem_idx);
+ catchup_worker = &worker->data.apply;
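+	/* Fill the shmem entry: our database, plus the remote node to replay from */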
+ catchup_worker->dboid = MyDatabaseId;
+ catchup_worker->remote_sysid = ri->sysid;
+ catchup_worker->remote_timeline = ri->timeline;
+ catchup_worker->remote_dboid = ri->dboid;
+ LWLockRelease(BdrWorkerCtl->lock);
/*
* Launch the catchup worker, ensuring that we free the shmem slot for the
BackgroundWorkerHandle *bgw_handle;
pid_t bgw_pid;
pid_t prev_bgw_pid = 0;
- BdrApplyWorker *catchup_worker = &worker->data.apply;
-
- /* Make sure the catchup worker can find its bdr.xxx_ GUCs */
- catchup_worker->connection_config_idx = cfg_index;
+ uint32 worker_arg;
/* Special parameters for a catchup worker only */
catchup_worker->replay_stop_lsn = target_lsn;
strncpy(bgw.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
bgw.bgw_restart_time = BGW_NEVER_RESTART;
+ Assert(MyProc->pid != 0);
bgw.bgw_notify_pid = MyProc->pid;
- bgw.bgw_main_arg = Int32GetDatum(worker_shmem_idx);
+
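+	/* Pack the worker generation into the high 16 bits, the shmem index into the low 16 */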
+ Assert(worker_shmem_idx <= UINT16_MAX);
+ worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)worker_shmem_idx;
+ bgw.bgw_main_arg = Int32GetDatum(worker_arg);
snprintf(bgw.bgw_name, BGW_MAXLEN,
- "bdr %s: catchup apply to %X/%X on %s",
- cfg->dbname,
- (uint32)(target_lsn >> 32), (uint32)target_lsn,
- cfg->name);
+ "bdr: catchup apply to %X/%X",
+ (uint32)(target_lsn >> 32), (uint32)target_lsn);
bgw.bgw_name[BGW_MAXLEN-1] = '\0';
/* Launch the catchup worker and wait for it to start */
{
/* Worker must've died before it finished */
elog(ERROR,
- "bdr %s: catchup worker exited before catching up to target LSN %X/%X",
- cfg->dbname,
+ "catchup worker exited before catching up to target LSN %X/%X",
(uint32)(target_lsn>>32), (uint32)target_lsn);
}
else
{
- elog(DEBUG1, "bdr %s: catchup worker caught up to target LSN",
- cfg->dbname);
+ elog(DEBUG1, "catchup worker caught up to target LSN");
}
}
PG_END_ENSURE_ERROR_CLEANUP(bdr_catchup_to_lsn_cleanup,
#include "lib/ilist.h"
+#define EMPTY_REPLICATION_NAME ""
#define BDR_SLOT_NAME_FORMAT "bdr_%u_%s_%u_%u__%s"
#define BDR_NODE_ID_FORMAT "bdr_"UINT64_FORMAT"_%u_%u_%u_%s"
-/* GUC storage for a configured BDR connection. */
+/* A configured BDR connection from bdr_connections */
typedef struct BdrConnectionConfig
{
- char *dsn;
- int apply_delay;
- bool init_replica;
- char *replica_local_dsn;
- char *replication_sets;
+ uint64 sysid;
+ TimeLineID timeline;
+ Oid dboid;
/*
- * These aren't technically GUCs, but are per-connection config
- * information obtained from the GUCs.
+	 * If the origin_* id fields are set then they must refer to our node,
+	 * otherwise we wouldn't load the configuration entry. So if
+	 * origin_is_my_id is false the origin was zero, and if true the origin
+	 * is the local node id.
*/
- char *name;
- char *dbname;
+ bool origin_is_my_id;
- /* Connection config might be broken (blank dsn, etc) */
- bool is_valid;
+ /*
+ * Is this connection unidirectional, or should we expect a reciprocal
+ * inbound connection and slot?
+ */
+ bool is_unidirectional;
+
+ char *dsn;
+
+ int apply_delay;
+
+ /* Quoted identifier-list of replication sets */
+ char *replication_sets;
} BdrConnectionConfig;
typedef struct BdrFlushPosition
extern void bdr_error_nodeids_must_differ(uint64 sysid, TimeLineID timeline,
Oid dboid);
+extern List* bdr_read_connection_configs(void);
+extern BdrConnectionConfig* bdr_get_connection_config(uint64 sysid,
+ TimeLineID timeline,
+ Oid dboid,
+ bool missing_ok);
+
+extern void bdr_free_connection_config(BdrConnectionConfig *cfg);
+
+extern void bdr_slot_name(Name slot_name, uint64 sysid, TimeLineID tlid,
+ Oid dboid, Oid local_dboid);
#endif /* BDR_INTERNAL_H */
track_commit_timestamp = on
-bdr.connections = 'node1to2,node1to3,node2to3,node2to1,node3to1,node3to2'
-
-bdr.node1to2_dsn = 'dbname=node2'
-bdr.node1to2_local_dbname = 'node1'
-bdr.node1to3_dsn = 'dbname=node3'
-bdr.node1to3_local_dbname = 'node1'
-
-bdr.node2to1_dsn = 'dbname=node1'
-bdr.node2to1_local_dbname = 'node2'
-#bdr.node2to1_init_replica=on
-#bdr.node2to1_replica_local_dsn='dbname=node2'
-bdr.node2to3_dsn = 'dbname=node3'
-bdr.node2to3_local_dbname = 'node2'
-
-bdr.node3to1_dsn = 'dbname=node1'
-bdr.node3to1_local_dbname = 'node3'
-#bdr.node3to1_init_replica=on
-#bdr.node3to1_replica_local_dsn='dbname=node3'
-bdr.node3to2_dsn = 'dbname=node2'
-bdr.node3to2_local_dbname = 'node3'
-
bdr.log_conflicts_to_table = True
bdr.default_apply_delay = 100
#log_min_messages = 'debug4'
#log_line_prefix = 'd=%d p=%p a=%a%q '
-log_statement = 'all'
+#log_statement = 'all'
max_worker_processes = 18
bdr_label_init(void)
{
/* Security label provider hook */
- register_label_provider("bdr", bdr_object_relabel);
+ register_label_provider(BDR_SECLABEL_PROVIDER, bdr_object_relabel);
}
static void
* bdr_label.h
*/
+#define BDR_SECLABEL_PROVIDER "bdr"
+
extern void bdr_label_init(void);
/* shmem init hook to chain to on startup, if any */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
-/* number of per database slots */
-static int bdr_locks_num_databases;
-
/* this database's state */
static BdrLocksDBState *bdr_my_locks_database = NULL;
Size size = 0;
size = add_size(size, sizeof(BdrLocksCtl));
- size = add_size(size, mul_size(sizeof(BdrLocksDBState), bdr_locks_num_databases));
+ size = add_size(size, mul_size(sizeof(BdrLocksDBState), bdr_max_databases));
return size;
}
/* Needs to be called from a shared_preload_library _PG_init() */
void
-bdr_locks_shmem_init(Size num_used_databases)
+bdr_locks_shmem_init()
{
/* Must be called from postmaster its self */
Assert(IsPostmasterEnvironment && !IsUnderPostmaster);
bdr_locks_ctl = NULL;
- bdr_locks_num_databases = num_used_databases;
RequestAddinShmemSpace(bdr_locks_shmem_size());
RequestAddinLWLocks(1);
int off;
int free_off = -1;
- for(off = 0; off < bdr_locks_num_databases; off++)
+ for(off = 0; off < bdr_max_databases; off++)
{
BdrLocksDBState *db = &bdr_locks_ctl->dbstate[off];
db->in_use = true;
return db;
}
- /*
- * Shouldn't happen with BDR statically configured, as the shmem segment
- * gets sized for the number of BDR-enabled databases. Later will be
- * affected by any bdr_max_databases setting or whatever we add.
- */
- ereport(PANIC,
+
+ ereport(ERROR,
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
- "Too many databases in use with BDR"));
+ errmsg("Too many databases BDR-enabled for bdr.max_databases"),
+ errhint("Increase bdr.max_databases above the current limit of %d", bdr_max_databases)));
}
static void
* Called from the per-db worker.
*/
void
-bdr_locks_startup(Size nnodes)
+bdr_locks_startup()
{
Relation rel;
SysScanDesc scan;
if (bdr_my_locks_database->locked_and_loaded)
return;
- bdr_my_locks_database->nnodes = nnodes;
+ /* We haven't yet established how many nodes we're connected to. */
+ bdr_my_locks_database->nnodes = 0;
initStringInfo(&s);
bdr_my_locks_database->locked_and_loaded = true;
}
+void
+bdr_locks_set_nnodes(Size nnodes)
+{
+ Assert(IsBackgroundWorker);
+ Assert(bdr_my_locks_database != NULL);
+
+ /*
+ * XXX DYNCONF No protection against node addition during DDL lock acquire
+ *
+ * Node counts are currently grabbed straight from the perdb worker's shmem
+ * and could change whenever someone adds a worker, with no locking or
+ * protection.
+ *
+ * We could acquire the local DDL lock before setting the nodecount, which
+ * would cause requests from other nodes to get rejected and cause other
+ * local tx's to fail to request the global DDL lock. However, we'd have to
+ * acquire it when we committed to adding the new worker, which happens in
+ * a user backend, and release it from the perdb worker once the new worker
+ * is registered. Fragile.
+ *
+ * Doing so also fails to solve the other half of the problem, which is
+ * that DDL locking expects there to be one bdr walsender for each apply
+ * worker, i.e. each connection should be reciprocal. We could connect to
+ * the other end and register a connection back to us, but that's getting
+ * complicated for what's always going to be a temporary option before a
+ * full part/join protocol is added.
+ *
+ * So we're just going to cross our fingers. Worst case is that DDL locking
+ * gets stuck and we have to restart all the nodes.
+ *
+ * The full part/join protocol will solve this by acquiring the DDL lock
+ * before joining.
+ */
+ bdr_my_locks_database->nnodes = nnodes;
+}
+
static void
bdr_prepare_message(StringInfo s, BdrMessageType message_type)
bdr_locks_find_my_database(false);
+ if (bdr_my_locks_database->nnodes == 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("No peer nodes or peer node count unknown, cannot acquire DDL lock"),
+ errhint("BDR is probably still starting up, wait a while")));
+ }
+
elog(DEBUG2, "attempting to acquire global DDL lock for (" BDR_LOCALID_FORMAT ")", BDR_LOCALID_FORMAT_ARGS);
/* send message about ddl lock */
* Another node has asked us to confirm that we've replayed up to a given LSN.
* We've seen the request message, so send the requested confirmation.
*
- * Runs in the walsender.
+ * Runs in the apply worker.
*/
void
bdr_process_request_replay_confirm(uint64 sysid, TimeLineID tli,
/* bdr_locks are not used by UDR at the moment */
void
-bdr_locks_startup(Size nnodes)
+bdr_locks_startup()
{
}
void
-bdr_locks_shmem_init(Size num_used_databases)
+bdr_locks_shmem_init()
{
}
BDR_MESSAGE_REPLAY_CONFIRM = 6
} BdrMessageType;
-void bdr_locks_startup(Size nnodes);
+void bdr_locks_startup(void);
+void bdr_locks_set_nnodes(Size nnodes);
void bdr_acquire_ddl_lock(void);
void bdr_process_acquire_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid);
void bdr_process_release_ddl_lock(uint64 sysid, TimeLineID tli, Oid datid,
bool client_float8_byval;
bool client_int_datetime;
char *client_db_encoding;
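+	/* Did the downstream request a unidirectional (UDR-style) connection? */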
+ bool client_unidirectional;
Oid bdr_schema_oid;
Oid bdr_conflict_handlers_reloid;
Oid bdr_locks_reloid;
* If this function returns it's safe to begin replay.
*/
static void
-bdr_ensure_node_ready()
+bdr_ensure_node_ready(BdrOutputData *data)
{
int spi_ret;
const uint64 sysid = GetSystemIdentifier();
char status;
+ BDRNodeInfo *node;
NameData dbname;
char *tmp_dbname;
+ /* Unidirectional connections don't require any checks atm. */
+ if (data->client_unidirectional)
+ return;
+
/* We need dbname valid outside this transaction, so copy it */
tmp_dbname = get_database_name(MyDatabaseId);
strncpy(NameStr(dbname), tmp_dbname, NAMEDATALEN);
if (spi_ret != SPI_OK_CONNECT)
elog(ERROR, "Local SPI connect failed; shouldn't happen");
- status = bdr_nodes_get_local_status(sysid, ThisTimeLineID, MyDatabaseId);
+ node = bdr_nodes_get_local_info(sysid, ThisTimeLineID, MyDatabaseId);
+ status = node == NULL ? '\0' : node->status;
+ bdr_bdr_node_free(node);
SPI_finish();
-/*
- * There is no local node status for UDR as we have only connection to this
- * node coming from a slave. The above is still useful to make sure the
- * extension is installed in the db.
- */
-#ifdef BUILDING_UDR
- switch (status)
- {
- case 'r':
- case '\0':
- case 'c':
- case 'i':
- break;
- default:
- elog(ERROR, "Unhandled case status=%c", status);
- break;
- }
-#else
-
- /* Complain if node isn't ready. */
+ /*
+	 * Complain unless the node is ready, i.e. its state is fully 'r'eady
+	 * or it is waiting for inbound sl'o't creation.
+ */
/* TODO: Allow soft error so caller can sleep and recheck? */
- if (status != 'r')
+ if (status != 'r' && status != 'o')
{
const char * const base_msg =
"bdr output plugin: slot creation rejected, bdr.bdr_nodes entry for local node (sysid=" UINT64_FORMAT
switch (status)
{
case 'r':
+ case 'o':
break; /* unreachable */
case '\0':
+ case 'b':
/*
* Can't allow replay when BDR hasn't started yet, as
* replica init might still need to run, causing a dump to
break;
}
}
-#endif
}
data->client_db_encoding = pstrdup(strVal(elem->arg));
else if (strcmp(elem->defname, "forward_changesets") == 0)
bdr_parse_bool(elem, &data->forward_changesets);
+ else if (strcmp(elem->defname, "unidirectional") == 0)
+ bdr_parse_bool(elem, &data->client_unidirectional);
else if (strcmp(elem->defname, "replication_sets") == 0)
{
int i;
StartTransactionCommand();
}
-#ifdef BUILDING_BDR
- /*
- * If running BDR, we expect the remote end (us) to have the BDR extension
- * installed before we permit slot creation. This prevents replication of
- * the CREATE EXTENSION bdr; command its self.
- */
+ /* BDR extension must be installed. */
if (get_namespace_oid("bdr", true) == InvalidOid)
{
ereport(ERROR,
BDR_LOCALID_FORMAT_ARGS),
errdetail("Cannot create a BDR slot without the BDR extension installed")));
}
-#endif
/* no options are passed in during initialization, so don't complain there */
if (!is_init)
if (data->client_db_encoding == NULL)
bdr_req_param("db_encoding");
+#ifdef BUILDING_UDR
+ /* Can't do bidirectional connection on UDR. */
+	if (!data->client_unidirectional)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("UDR only supports unidirectional connections")));
+
+#endif
+
/* check incompatibilities we cannot work around */
if (strcmp(data->client_db_encoding, GetDatabaseEncodingName()) != 0)
elog(ERROR, "mismatching encodings are not yet supported");
if (data->client_pg_version / 100 != PG_VERSION_NUM / 100)
data->allow_sendrecv_protocol = false;
- bdr_maintain_schema();
+ bdr_maintain_schema(false);
data->bdr_schema_oid = get_namespace_oid("bdr", true);
schema_oid = data->bdr_schema_oid;
* This'll ERROR out if we're not ready. Note that this does NOT
* prevent slot creation, only START_REPLICATION from the slot.
*/
- bdr_ensure_node_ready();
+ bdr_ensure_node_ready(data);
}
if (tx_started)
#include "utils/memutils.h"
#include "utils/snapmgr.h"
+PG_FUNCTION_INFO_V1(bdr_connections_changed);
+
+Datum
+bdr_connections_changed(PG_FUNCTION_ARGS);
+
+/* In the commit hook, should we attempt to start a per-db worker? */
+static bool xacthook_connection_added = false;
+
+/*
+ * Scan shmem looking for a perdb worker for the named DB and
+ * return its offset. If not found, return -1.
+ *
+ * Must hold the LWLock on the worker control segment in at
+ * least share mode.
+ *
+ * Note that there's no guarantee that the worker is actually
+ * started up.
+ */
+int
+find_perdb_worker_slot(Oid dboid, BdrWorker **worker_found)
+{
+ int i, found = -1;
+
+ Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+ for (i = 0; i < bdr_max_workers; i++)
+ {
+ BdrWorker *w = &BdrWorkerCtl->slots[i];
+ if (w->worker_type == BDR_WORKER_PERDB)
+ {
+ BdrPerdbWorker *pw = &w->data.perdb;
+ if (pw->database_oid == dboid)
+ {
+ found = i;
+ if (worker_found != NULL)
+ *worker_found = w;
+ break;
+ }
+ }
+ }
+
+ return found;
+}
+
+/*
+ * Scan shmem looking for an apply worker for the current perdb worker and
+ * specified target node identifier and return its offset. If not found, return
+ * -1.
+ *
+ * Must hold the LWLock on the worker control segment in at least share mode.
+ *
+ * Note that there's no guarantee that the worker is actually started up.
+ */
+static int
+find_apply_worker_slot(uint64 sysid, TimeLineID timeline, Oid dboid, BdrWorker **worker_found)
+{
+ int i, found = -1;
+
+ Assert(bdr_worker_type == BDR_WORKER_PERDB);
+ Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+ for (i = 0; i < bdr_max_workers; i++)
+ {
+ BdrWorker *w = &BdrWorkerCtl->slots[i];
+ if (w->worker_type == BDR_WORKER_APPLY)
+ {
+ BdrApplyWorker *aw = &w->data.apply;
+ if (aw->dboid == MyDatabaseId
+ && aw->remote_sysid == sysid
+ && aw->remote_timeline == timeline
+ && aw->remote_dboid == dboid)
+ {
+ found = i;
+ if (worker_found != NULL)
+ *worker_found = w;
+ break;
+ }
+ }
+ }
+
+ return found;
+}
+
+static void
+bdr_perdb_xact_callback(XactEvent event, void *arg)
+{
+ switch (event)
+ {
+ case XACT_EVENT_COMMIT:
+ if (xacthook_connection_added)
+ {
+ int slotno;
+ BdrWorker *w;
+
+ xacthook_connection_added = false;
+
+ LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+ /*
+ * If a perdb worker already exists, wake it and tell it to
+ * check for new connections.
+ */
+ slotno = find_perdb_worker_slot(MyDatabaseId, &w);
+ if (slotno >= 0)
+ {
+ /*
+ * The worker is registered, but might not be started yet
+ * (or could be crashing and restarting). If it's not
+ * started the latch will be zero. If it's started but
+ * dead, the latch will be bogus, but it's safe to set a
+ * proclatch to a dead process. At worst we'll set a latch
+ * for the wrong process, and that's fine. If it's zero
+ * then the worker is still starting and will see our new
+ * changes anyway.
+ */
+ if (w->data.perdb.proclatch != NULL)
+ SetLatch(w->data.perdb.proclatch);
+ }
+ else
+ {
+ /*
+ * Per-db worker doesn't exist, ask the supervisor to check for
+ * changes and register new per-db workers for labeled
+ * databases.
+ */
+ if (BdrWorkerCtl->supervisor_latch)
+ SetLatch(BdrWorkerCtl->supervisor_latch);
+ }
+
+ LWLockRelease(BdrWorkerCtl->lock);
+ }
+ break;
+ default:
+ /* We're not interested in other tx events */
+ break;
+ }
+}
+
+/*
+ * Prepare to launch a perdb worker for the current DB if it's not already
+ * running, and register a XACT_EVENT_COMMIT hook to perform the actual launch
+ * when the transaction that added the connection commits.
+ *
+ * If a perdb worker is already running, notify it to check for new connections.
+ */
+Datum
+bdr_connections_changed(PG_FUNCTION_ARGS)
+{
+	/* Only register the xact callback and arm it once per transaction */
+ if (!xacthook_connection_added)
+ {
+ RegisterXactCallback(bdr_perdb_xact_callback, NULL);
+ xacthook_connection_added = true;
+ }
+ PG_RETURN_VOID();
+}
+
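
As a usage sketch: a backend that adds a row to bdr.bdr_connections calls bdr.bdr_connections_changed() in the same transaction, and the relevant worker is only poked at COMMIT. The column list follows the INSERT used by the upgrade code later in this patch; the node identity, the DSN, and the assumption that conn_replication_sets may be omitted are illustrative only.

    BEGIN;
    INSERT INTO bdr.bdr_connections
           (conn_sysid, conn_timeline, conn_dboid,
            conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
            conn_is_unidirectional, conn_dsn)
    VALUES ('6012345678901234567', 1, 16385,     -- hypothetical peer node identity
            '0', 0, 0,                           -- origin (0,0,0): applies on every node
            false, 'host=node2 dbname=mydb');    -- hypothetical DSN
    SELECT bdr.bdr_connections_changed();        -- arms the commit hook
    COMMIT;                                      -- per-db worker or supervisor latch is set here
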
+static int
+getattno(const char *colname)
+{
+ int attno;
+
+ attno = SPI_fnumber(SPI_tuptable->tupdesc, colname);
+ if (attno == SPI_ERROR_NOATTRIBUTE)
+ elog(ERROR, "SPI error while reading %s from bdr.bdr_connections", colname);
+
+ return attno;
+}
+
/*
* Launch a dynamic bgworker to run bdr_apply_main for each bdr connection on
* the database identified by dbname.
*
- * Scans the BdrWorkerCtl shmem segment for workers of type BDR_WORKER_APPLY
- * with a matching database name and launches them.
+ * Scans the bdr.bdr_connections table and launches an apply worker for any
+ * connection that doesn't already have one.
*/
-static List*
-bdr_launch_apply_workers(char *dbname)
+void
+bdr_launch_apply_workers(Oid dboid)
{
- List *apply_workers = NIL;
- BackgroundWorker apply;
- int i;
-
+ BackgroundWorker bgw;
+ int i, ret;
+ Size nnodes = 0;
+#define BDR_CON_Q_NARGS 3
+ Oid argtypes[BDR_CON_Q_NARGS] = { TEXTOID, OIDOID, OIDOID };
+ Datum values[BDR_CON_Q_NARGS];
+ char sysid_str[33];
+
+ /* Should be called from the perdb worker */
Assert(IsBackgroundWorker);
+ Assert(bdr_worker_type == BDR_WORKER_PERDB);
+
+ snprintf(sysid_str, sizeof(sysid_str), UINT64_FORMAT, GetSystemIdentifier());
+ sysid_str[sizeof(sysid_str)-1] = '\0';
+
+ elog(DEBUG2, "launching apply workers");
+
+ /*
+ * It's easy enough to make this tolerant of an open tx, but in general
+ * rollback doesn't make sense here.
+ */
+ Assert(!IsTransactionState());
/* Common apply worker values */
- apply.bgw_flags = BGWORKER_SHMEM_ACCESS |
+ bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
BGWORKER_BACKEND_DATABASE_CONNECTION;
- apply.bgw_start_time = BgWorkerStart_RecoveryFinished;
- apply.bgw_main = NULL;
- strncpy(apply.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
- strncpy(apply.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
- apply.bgw_restart_time = 5;
- apply.bgw_notify_pid = 0;
-
- /* Launch apply workers */
- LWLockAcquire(BdrWorkerCtl->lock, LW_SHARED);
- for (i = 0; i < bdr_max_workers; i++)
+ bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ bgw.bgw_main = NULL;
+ strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+ strncpy(bgw.bgw_function_name, "bdr_apply_main", BGW_MAXLEN);
+ bgw.bgw_restart_time = 5;
+ bgw.bgw_notify_pid = 0;
+
+ StartTransactionCommand();
+
+ /*
+ * Look up connection entries for all nodes other than our own.
+ *
+ * If an entry with our origin (sysid,tlid,dboid) exists, treat that as
+ * overriding the generic one.
+ */
+ values[0] = CStringGetTextDatum(sysid_str);
+ values[1] = ObjectIdGetDatum(ThisTimeLineID);
+ values[2] = ObjectIdGetDatum(MyDatabaseId);
+
+ SPI_connect();
+
+ ret = SPI_execute_with_args(
+ "SELECT DISTINCT ON (conn_sysid, conn_timeline, conn_dboid) "
+ " conn_sysid, conn_timeline, conn_dboid, "
+ " conn_is_unidirectional, "
+ " conn_origin_dboid <> 0 AS origin_is_my_id "
+ "FROM bdr.bdr_connections "
+ "WHERE ( "
+ " (conn_origin_sysid = '0' AND "
+ " conn_origin_timeline = 0 AND "
+ " conn_origin_dboid = 0) "
+ " OR "
+ " (conn_origin_sysid = $1 AND "
+ " conn_origin_timeline = $2 AND "
+ " conn_origin_dboid = $3) "
+ " ) AND NOT ( "
+ " conn_sysid = $1 AND "
+ " conn_timeline = $2 AND "
+ " conn_dboid = $3"
+ " ) "
+ "ORDER BY conn_sysid, conn_timeline, conn_dboid, "
+ " conn_origin_sysid ASC NULLS LAST, "
+ " conn_timeline ASC NULLS LAST, "
+ " conn_dboid ASC NULLS LAST ",
+ BDR_CON_Q_NARGS, argtypes, values, NULL,
+ false, 0);
+
+ if (ret != SPI_OK_SELECT)
+ elog(ERROR, "SPI error while querying bdr.bdr_connections");
+
+ nnodes = SPI_processed;
+
+ elog(DEBUG2, "found %u workers in bdr_connections", (uint32)nnodes);
+
+ for (i = 0; i < SPI_processed; i++)
{
- BdrWorker *worker = &BdrWorkerCtl->slots[i];
+ BackgroundWorkerHandle *bgw_handle;
+ HeapTuple tuple;
+ uint32 slot;
+ uint32 worker_arg;
+ BdrWorker *worker;
+ BdrApplyWorker *apply;
+ Datum temp_datum;
+ bool isnull;
+ uint64 target_sysid;
+ TimeLineID target_timeline;
+ Oid target_dboid;
+ char* tmp_sysid;
+ bool origin_is_my_id,
+ conn_is_unidirectional;
+
+ tuple = SPI_tuptable->vals[i];
+
+ tmp_sysid = SPI_getvalue(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_sysid"));
+
+ if (sscanf(tmp_sysid, UINT64_FORMAT, &target_sysid) != 1)
+ elog(ERROR, "Parsing sysid uint64 from %s failed", tmp_sysid);
+
+ temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_timeline"),
+ &isnull);
+ Assert(!isnull);
+ target_timeline = DatumGetObjectId(temp_datum);
+
+ temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_dboid"),
+ &isnull);
+ Assert(!isnull);
+ target_dboid = DatumGetObjectId(temp_datum);
+
+ temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("conn_is_unidirectional"),
+ &isnull);
+ Assert(!isnull);
+ conn_is_unidirectional = DatumGetBool(temp_datum);
+
+ temp_datum = SPI_getbinval(tuple, SPI_tuptable->tupdesc,
+ getattno("origin_is_my_id"),
+ &isnull);
+ Assert(!isnull);
+ origin_is_my_id = DatumGetBool(temp_datum);
+
+ elog(DEBUG2, "Found bdr_connections entry for "BDR_LOCALID_FORMAT" (origin specific: %d, unidirectional: %d)",
+ target_sysid, target_timeline, target_dboid,
+ EMPTY_REPLICATION_NAME, (int)origin_is_my_id, (int)conn_is_unidirectional);
+
+ Assert(!LWLockHeldByMe(BdrWorkerCtl->lock));
+ LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
- switch(worker->worker_type)
+ /*
+ * Is there already a worker registered for this connection?
+ *
+ * TODO DYNCONF Each apply worker should have its latch set and respond
+ * by checking to see whether it needs to apply any new configuration.
+ */
+ if (find_apply_worker_slot(target_sysid, target_timeline, target_dboid, NULL) != -1)
{
- case BDR_WORKER_APPLY:
- {
- BdrApplyWorker *con = &worker->data.apply;
- BdrConnectionConfig *cfg =
- bdr_connection_configs[con->connection_config_idx];
- Assert(cfg != NULL);
- if ( strcmp(cfg->dbname, dbname) == 0 )
- {
- /* It's an apply worker for our DB; register it */
- BackgroundWorkerHandle *bgw_handle;
-
- if (con->bgw_is_registered)
- /*
- * This worker was registered on a previous pass;
- * this is probably a restart of the per-db worker.
- * Don't register a duplicate.
- */
- continue;
-
- snprintf(apply.bgw_name, BGW_MAXLEN,
- BDR_LOCALID_FORMAT": %s: apply",
- BDR_LOCALID_FORMAT_ARGS, cfg->name);
- apply.bgw_main_arg = Int32GetDatum(i);
-
- if (!RegisterDynamicBackgroundWorker(&apply,
- &bgw_handle))
- {
- ereport(ERROR,
- (errmsg("bdr: Failed to register background worker"
- " %s, see previous log messages",
- cfg->name)));
- }
- /* We've launched this one, don't do it again */
- con->bgw_is_registered = true;
- apply_workers = lcons(bgw_handle, apply_workers);
- }
- }
- break;
- case BDR_WORKER_EMPTY_SLOT:
- case BDR_WORKER_PERDB:
- /* Nothing to do; switch only so we get warnings for insane cases */
- break;
- default:
- /* Bogus value */
- elog(FATAL, "Unhandled BdrWorkerType case %i, memory corruption?",
- worker->worker_type);
- break;
+ elog(DEBUG2, "Skipping registration of worker for node "BDR_LOCALID_FORMAT" on db oid=%u: already registered",
+ target_sysid, target_timeline, target_dboid,
+ EMPTY_REPLICATION_NAME, dboid);
+ LWLockRelease(BdrWorkerCtl->lock);
+ continue;
+ }
+
+ /* Set the display name in 'ps' etc */
+ snprintf(bgw.bgw_name, BGW_MAXLEN,
+ BDR_LOCALID_FORMAT"->"BDR_LOCALID_FORMAT,
+ BDR_LOCALID_FORMAT_ARGS,
+ target_sysid, target_timeline, target_dboid,
+ EMPTY_REPLICATION_NAME);
+
+ /* Allocate a new shmem slot for this apply worker */
+ worker = bdr_worker_shmem_alloc(BDR_WORKER_APPLY, &slot);
+
+ /* Tell the apply worker what its shmem slot is */
+ Assert(slot <= UINT16_MAX);
+ worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)slot;
+ bgw.bgw_main_arg = Int32GetDatum(worker_arg);
+
+ /*
+ * Apply workers (other than in catchup mode, which are registered
+ * elsewhere) should not be using the local node's connection entry.
+ */
+ Assert(!(target_sysid == GetSystemIdentifier() &&
+ target_timeline == ThisTimeLineID &&
+ target_dboid == MyDatabaseId));
+
+ /* Now populate the apply worker state */
+ apply = &worker->data.apply;
+ apply->dboid = MyDatabaseId;
+ apply->remote_sysid = target_sysid;
+ apply->remote_timeline = target_timeline;
+ apply->remote_dboid = target_dboid;
+ apply->replay_stop_lsn = InvalidXLogRecPtr;
+ apply->forward_changesets = false;
+
+ LWLockRelease(BdrWorkerCtl->lock);
+
+ /*
+ * Finally, register the worker for launch.
+ */
+ if (!RegisterDynamicBackgroundWorker(&bgw,
+ &bgw_handle))
+ {
+ /*
+ * Already-registered workers will keep on running. We need to
+ * make sure the slot we just acquired but failed to launch a
+ * worker for gets released again though.
+ */
+ LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+ apply->dboid = InvalidOid;
+ apply->remote_sysid = 0;
+ apply->remote_timeline = 0;
+ apply->remote_dboid = InvalidOid;
+ worker->worker_type = BDR_WORKER_EMPTY_SLOT;
+ LWLockRelease(BdrWorkerCtl->lock);
+
+ ereport(ERROR,
+ (errmsg("bdr: Failed to register background worker"
+ " for "BDR_LOCALID_FORMAT", see previous log messages",
+ BDR_LOCALID_FORMAT_ARGS)));
+ }
+ else
+ {
+ elog(DEBUG2, "registered apply worker for "BDR_LOCALID_FORMAT,
+ target_sysid, target_timeline, target_dboid,
+ EMPTY_REPLICATION_NAME);
}
}
- LWLockRelease(BdrWorkerCtl->lock);
- return apply_workers;
+ SPI_finish();
+
+ CommitTransactionCommand();
+
+ elog(DEBUG2, "done registering apply workers");
+
+ /*
+ * Now we need to tell the lock manager and the sequence
+ * manager about the changed node count.
+ *
+ * There's no truly safe way to do this without a proper
+ * part/join protocol, so all we're going to do is update
+ * the node count in shared memory.
+ */
+ bdr_worker_slot->data.perdb.nnodes = nnodes;
+#ifdef BUILDING_BDR
+ bdr_locks_set_nnodes(nnodes);
+ bdr_sequencer_set_nnodes(nnodes);
+#endif
+
+ elog(DEBUG2, "updated worker counts");
}
/*
void
bdr_perdb_worker_main(Datum main_arg)
{
- int rc = 0;
- List *apply_workers;
- ListCell *c;
- BdrPerdbWorker *perdb;
- BdrWorker *bdr_worker_slot;
- StringInfoData si;
- bool wait;
+ int rc = 0;
+ BdrPerdbWorker *perdb;
+ StringInfoData si;
+ bool wait;
+ uint32 worker_arg;
+ uint16 worker_generation;
+ uint16 perdb_worker_idx;
+ BDRNodeInfo *local_node;
initStringInfo(&si);
Assert(IsBackgroundWorker);
- bdr_worker_slot = &BdrWorkerCtl->slots[ DatumGetInt32(main_arg) ];
+ worker_arg = DatumGetInt32(main_arg);
+
+ worker_generation = (uint16)(worker_arg >> 16);
+ perdb_worker_idx = (uint16)(worker_arg & 0x0000FFFF);
+
+ if (worker_generation != BdrWorkerCtl->worker_generation)
+ {
+ elog(DEBUG1, "perdb worker from generation %d exiting after finding shmem generation is %d",
+ worker_generation, BdrWorkerCtl->worker_generation);
+ proc_exit(0);
+ }
+
+ bdr_worker_slot = &BdrWorkerCtl->slots[perdb_worker_idx];
Assert(bdr_worker_slot->worker_type == BDR_WORKER_PERDB);
perdb = &bdr_worker_slot->data.perdb;
bdr_worker_type = BDR_WORKER_PERDB;
bdr_worker_init(NameStr(perdb->dbname));
+ perdb->nnodes = 0;
+
elog(DEBUG1, "per-db worker for node " BDR_LOCALID_FORMAT " starting", BDR_LOCALID_FORMAT_ARGS);
- appendStringInfo(&si, BDR_LOCALID_FORMAT": %s", BDR_LOCALID_FORMAT_ARGS, "perdb worker");
+ appendStringInfo(&si, BDR_LOCALID_FORMAT": %s", BDR_LOCALID_FORMAT_ARGS, "perdb");
SetConfigOption("application_name", si.data, PGC_USERSET, PGC_S_SESSION);
CurrentResourceOwner = ResourceOwnerCreate(NULL, "bdr seq top-level resource owner");
bdr_saved_resowner = CurrentResourceOwner;
+ /*
+	 * It's necessary to acquire a lock here so that a concurrent
+ * bdr_perdb_xact_callback can't try to set our latch at the same
+ * time as we write to it.
+ *
+ * There's no per-worker lock, so we just take the lock on the
+ * whole segment.
+ */
+ LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+ perdb->proclatch = &MyProc->procLatch;
+ perdb->database_oid = MyDatabaseId;
+ LWLockRelease(BdrWorkerCtl->lock);
+
/* need to be able to perform writes ourselves */
bdr_executor_always_allow_writes(true);
- bdr_locks_startup(perdb->nnodes);
+ bdr_locks_startup();
+
+ {
+ int spi_ret;
+ MemoryContext saved_ctx;
+
+ /*
+ * Check the local bdr.bdr_nodes table to see if there's an entry for
+ * our node.
+ *
+ * Note that we don't have to explicitly SPI_finish(...) on error paths;
+ * that's taken care of for us.
+ */
+ StartTransactionCommand();
+ spi_ret = SPI_connect();
+ if (spi_ret != SPI_OK_CONNECT)
+ elog(ERROR, "SPI already connected; this shouldn't be possible");
+
+ saved_ctx = MemoryContextSwitchTo(TopMemoryContext);
+ local_node = bdr_nodes_get_local_info(GetSystemIdentifier(), ThisTimeLineID,
+ MyDatabaseId);
+ MemoryContextSwitchTo(saved_ctx);
+
+ if (local_node == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("local node record not found")));
+
+ SPI_finish();
+ CommitTransactionCommand();
+ }
/*
* Do we need to init the local DB from a remote node?
- *
- * Checks bdr.bdr_nodes.status, does any remote initialization required if
- * there's an init_replica connection, and ensures that
- * bdr.bdr_nodes.status=r for our entry before continuing.
*/
- bdr_init_replica(&perdb->dbname);
+ if (local_node->status != 'r')
+ bdr_init_replica(local_node);
- elog(DEBUG1, "Starting bdr apply workers for db %s", NameStr(perdb->dbname));
+ elog(DEBUG1, "Starting bdr apply workers for "BDR_LOCALID_FORMAT" (%s)",
+ BDR_LOCALID_FORMAT_ARGS, NameStr(perdb->dbname));
/* Launch the apply workers */
- apply_workers = bdr_launch_apply_workers(NameStr(perdb->dbname));
-
- /*
- * For now, just free the bgworker handles. Later we'll probably want them
- * for adding/removing/reconfiguring bgworkers.
- */
- foreach(c, apply_workers)
- {
- BackgroundWorkerHandle *h = (BackgroundWorkerHandle *) lfirst(c);
- pfree(h);
- }
+ bdr_launch_apply_workers(MyDatabaseId);
#ifdef BUILDING_BDR
elog(DEBUG1, "BDR starting sequencer on db \"%s\"",
/* emergency bailout if postmaster has died */
if (rc & WL_POSTMASTER_DEATH)
proc_exit(1);
+
+ if (rc & WL_LATCH_SET)
+ {
+ /*
+ * If the perdb worker's latch is set we're being asked
+ * to rescan and launch new apply workers.
+ */
+ bdr_launch_apply_workers(MyDatabaseId);
+ }
}
}
+ perdb->database_oid = InvalidOid;
proc_exit(0);
}
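
Since the per-db worker sets its application_name from the local node identity with a ": perdb" suffix (see above), it can be spotted from SQL; a small sketch that assumes only that suffix format:

    SELECT pid, datname, application_name
    FROM pg_stat_activity
    WHERE application_name LIKE '%: perdb';
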
include = 'bdr_regress_common.conf'
-bdr.connections = 'node1, node2'
-
-bdr.node1_dsn = 'dbname=postgres'
-bdr.node1_local_dbname = 'regression'
-bdr.node1_replication_sets = 'default, important, for-node-1'
-
-bdr.node2_dsn = 'dbname=regression'
-bdr.node2_local_dbname = 'postgres'
-bdr.node2_replication_sets = 'default, important, for-node-2, for-node-2-insert, for-node-2-update, for-node-2-delete'
-
bdrtest.readdb1 = 'regression'
bdrtest.readdb2 = 'postgres'
bdrtest.writedb1 = 'regression'
#include "bdr.h"
+#include "access/genam.h"
#include "access/heapam.h"
#include "access/xact.h"
#include "commands/seclabel.h"
+#include "utils/builtins.h"
#include "utils/catcache.h"
+#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/jsonapi.h"
#include "utils/json.h"
return false;
}
-#include "access/genam.h"
-#include "utils/builtins.h"
-#include "utils/fmgroids.h"
-
static HeapTuple
replset_lookup(Relation rel, const char *cname)
{
typedef struct BdrSequencerControl
{
- size_t slot;
+ int next_slot;
BdrSequencerSlot slots[FLEXIBLE_ARRAY_MEMBER];
} BdrSequencerControl;
{
/* initialize */
memset(BdrSequencerCtl, 0, bdr_sequencer_shmem_size());
+ /*
+ * next_slot allows perdb workers to allocate seq slots.
+ * The sequencer will likely be separated into a different
+ * worker later.
+ */
+ BdrSequencerCtl->next_slot = 0;
}
LWLockRelease(AddinShmemInitLock);
shmem_startup_hook = bdr_sequencer_shmem_startup;
}
+/*
+ * The perdb worker doing sequencer setup needs to know what slot to
+ * allocate for the next sequencer.
+ *
+ * This should go away once the sequencer is separated into its own
+ * worker.
+ */
+int
+bdr_sequencer_get_next_free_slot(void)
+{
+ return BdrSequencerCtl->next_slot ++;
+}
+
void
bdr_sequencer_wakeup(void)
{
bdr_seq_pending_wakeup = true;
}
+void
+bdr_sequencer_set_nnodes(Size nnodes)
+{
+ BdrSequencerSlot *slot = &BdrSequencerCtl->slots[seq_slot];
+ slot->nnodes = nnodes;
+}
+
void
bdr_sequencer_init(int new_seq_slot, Size nnodes)
{
--- /dev/null
+/* -------------------------------------------------------------------------
+ *
+ * bdr_supervisor.c
+ * Cluster wide supervisor worker.
+ *
+ * Copyright (C) 2014-2015, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * bdr_supervisor.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+#include "bdr_label.h"
+
+#include "miscadmin.h"
+#include "pgstat.h"
+
+#include "access/relscan.h"
+#include "access/skey.h"
+#include "access/xact.h"
+
+#include "catalog/objectaddress.h"
+#include "catalog/pg_database.h"
+#include "catalog/pg_shseclabel.h"
+
+#include "commands/dbcommands.h"
+#include "commands/seclabel.h"
+
+#include "postmaster/bgworker.h"
+
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/ipc.h"
+
+#include "utils/builtins.h"
+#include "utils/elog.h"
+#include "utils/fmgroids.h"
+#include "utils/guc.h"
+
+/*
+ * Register a new perdb worker for the named database. The worker MUST
+ * not already exist.
+ *
+ * This is called by the supervisor during startup, and again when its latch
+ * is set after a user backend adds the first connection for a database.
+ */
+static void
+bdr_register_perdb_worker(const char * dbname)
+{
+ BackgroundWorkerHandle *bgw_handle;
+ BackgroundWorker bgw;
+ BdrWorker *worker;
+ BdrPerdbWorker *perdb;
+ unsigned int worker_slot_number;
+ uint32 worker_arg;
+
+ Assert(LWLockHeldByMe(BdrWorkerCtl->lock));
+
+ elog(DEBUG2, "Registering per-db worker for db %s", dbname);
+
+ worker = bdr_worker_shmem_alloc(
+ BDR_WORKER_PERDB,
+ &worker_slot_number
+ );
+
+ perdb = &worker->data.perdb;
+
+ strncpy(NameStr(perdb->dbname),
+ dbname, NAMEDATALEN);
+ NameStr(perdb->dbname)[NAMEDATALEN-1] = '\0';
+ /* Nodecount is set when apply workers are registered */
+ perdb->nnodes = 0;
+#ifdef BUILDING_BDR
+ perdb->seq_slot = bdr_sequencer_get_next_free_slot();
+#endif
+
+ /*
+	 * The rest of the perdb worker's shmem entry - proclatch
+	 * and database_oid - gets set up by the worker during startup.
+ */
+
+ bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+ BGWORKER_BACKEND_DATABASE_CONNECTION;
+ bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ bgw.bgw_main = NULL;
+ strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+ strncpy(bgw.bgw_function_name, "bdr_perdb_worker_main", BGW_MAXLEN);
+ bgw.bgw_restart_time = 5;
+ bgw.bgw_notify_pid = 0;
+ snprintf(bgw.bgw_name, BGW_MAXLEN,
+ "bdr db: %s", dbname);
+
+ /*
+ * The main arg is composed of two uint16 parts - the worker
+ * generation number (see bdr_worker_shmem_startup) and the index into
+ * BdrWorkerCtl->slots in shared memory.
+ */
+ Assert(worker_slot_number <= UINT16_MAX);
+ worker_arg = (((uint32)BdrWorkerCtl->worker_generation) << 16) | (uint32)worker_slot_number;
+ bgw.bgw_main_arg = Int32GetDatum(worker_arg);
+
+ if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+ errmsg("Registering BDR worker failed, check prior log messages for details")));
+ }
+
+ elog(DEBUG2, "Registered per-db worker for %s successfully", dbname);
+}
+
+/*
+ * Check for BDR-enabled DBs and start per-db workers for any that currently
+ * lack them.
+ *
+ * TODO DYNCONF: Handle removal of BDR from DBs
+ */
+static void
+bdr_supervisor_rescan_dbs(void)
+{
+ Relation secrel;
+ ScanKeyData skey[2];
+ SysScanDesc scan;
+ HeapTuple secTuple;
+ int n_new_workers = 0, bdr_dbs = 0;
+
+ elog(DEBUG1, "Supervisor scanning for BDR-enabled databases");
+
+ pgstat_report_activity(STATE_RUNNING, "scanning backends");
+
+ StartTransactionCommand();
+
+ /*
+ * Scan pg_seclabel looking for entries for pg_database with the bdr label
+ * provider. We'll find all labels for the BDR provider, irrespective
+ * of value.
+ *
+ * The only index present isn't much use for this scan and using it makes
+ * us set up more keys, so do a heap scan.
+ *
+ * The lock taken on pg_shseclabel must be strong enough to conflict with
+	 * the lock taken by bdr.bdr_connection_add(...) to ensure that any
+	 * transactions adding new labels have committed and cleaned up before we
+ * read it. Otherwise a race between the supervisor latch being set in a
+ * commit hook and the tuples actually becoming visible is possible.
+ */
+ secrel = heap_open(SharedSecLabelRelationId, RowShareLock);
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_shseclabel_classoid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(DatabaseRelationId));
+
+ ScanKeyInit(&skey[1],
+ Anum_pg_shseclabel_provider,
+ BTEqualStrategyNumber, F_TEXTEQ,
+ CStringGetTextDatum(BDR_SECLABEL_PROVIDER));
+
+ scan = systable_beginscan(secrel, InvalidOid, false, NULL, 2, &skey[0]);
+
+ /*
+ * We need to scan the shmem segment that tracks BDR workers and possibly
+ * modify it, so lock it.
+ *
+ * We have to take an exclusive lock in case we need to modify it,
+ * otherwise we'd be faced with a lock upgrade.
+ */
+ LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+
+ /*
+	 * Now examine each label and if there's no worker for the labeled
+ * DB already, start one.
+ */
+ while (HeapTupleIsValid(secTuple = systable_getnext(scan)))
+ {
+ FormData_pg_shseclabel *sec;
+ char *label_dbname;
+
+ sec = (FormData_pg_shseclabel*) GETSTRUCT(secTuple);
+
+ /*
+ * The per-db workers are mapped by name not oid, and that's necessary
+ * because the bgworker API requires that databases be identified by
+ * name.
+ *
+ * Look up the name of the DB with this OID and compare it. It's a bit slow,
+ * but we aren't doing this much.
+ *
+ * FIXME: Currently if a database is renamed, you'll have to restart
+ * PostgreSQL before BDR notices.
+ */
+ label_dbname = get_database_name(sec->objoid);
+
+ if (!bdr_is_bdr_activated_db(sec->objoid))
+ {
+ pfree(label_dbname);
+ continue;
+ }
+
+		elog(DEBUG1, "Found BDR-enabled database %s (oid=%u)",
+ label_dbname, sec->objoid);
+
+ bdr_dbs++;
+
+ /*
+ * Check if we have a per-db worker for this db oid already and if
+ * we don't, start one.
+ *
+ * This is O(n^2) for n BDR-enabled DBs; to be more scalable we could
+ * accumulate and sort the oids, then do a single scan of the shmem
+ * segment. But really, if you have that many DBs this cost is nothing.
+ */
+ if (find_perdb_worker_slot(sec->objoid, NULL) == -1)
+ {
+ /* No perdb worker exists for this DB, make one */
+ bdr_register_perdb_worker(label_dbname);
+ n_new_workers++;
+		}
+		else
+		{
+ elog(DEBUG2, "per-db worker for db %s already exists, not registering",
+ label_dbname);
+ }
+
+ pfree(label_dbname);
+ }
+
+ elog(DEBUG2, "Found %i BDR-labeled DBs; registered %i new per-db workers",
+ bdr_dbs, n_new_workers);
+
+ LWLockRelease(BdrWorkerCtl->lock);
+
+ systable_endscan(scan);
+ heap_close(secrel, RowShareLock);
+
+ CommitTransactionCommand();
+
+ elog(DEBUG2, "Finished scanning for BDR-enabled databases");
+
+ pgstat_report_activity(STATE_IDLE, NULL);
+}
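
The catalog scan above amounts to roughly the following SQL; this is only a sketch (the worker uses a heap scan with ScanKeys rather than the planner, and the 'bdr' provider name is assumed to be the value of BDR_SECLABEL_PROVIDER):

    SELECT d.datname, l.label
    FROM pg_shseclabel l
    JOIN pg_database d ON d.oid = l.objoid
    WHERE l.classoid = 'pg_database'::regclass
      AND l.provider = 'bdr';
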
+
+/*
+ * Create the database the supervisor remains connected
+ * to, a DB with no user connections permitted.
+ *
+ * This is a workaround for the inability to use pg_shseclabel
+ * without a DB connection; see comments in bdr_supervisor_worker_main.
+ */
+static void
+bdr_supervisor_createdb(void)
+{
+ Oid dboid;
+
+ StartTransactionCommand();
+
+ /* If the DB already exists, no need to create it */
+ dboid = get_database_oid("bdr", true);
+
+ if (dboid == InvalidOid)
+ {
+ CreatedbStmt stmt;
+ DefElem de_template;
+ DefElem de_connlimit;
+
+ de_template.defname = "template";
+ de_template.type = T_String;
+ de_template.arg = (Node*) makeString("template1");
+
+ de_connlimit.defname = "connectionlimit";
+		de_connlimit.type = T_Integer;
+ de_connlimit.arg = (Node*) makeInteger(1);
+
+ stmt.dbname = "bdr";
+ stmt.options = list_make2(&de_template, &de_connlimit);
+
+ dboid = createdb(&stmt);
+
+ if (dboid == InvalidOid)
+ elog(ERROR, "Failed to create 'bdr' DB");
+
+ /* TODO DYNCONF: Add a comment to the db, and/or a dummy table */
+
+		elog(LOG, "Created database 'bdr' (oid=%u) during BDR startup", dboid);
+ }
+ else
+ {
+		elog(DEBUG3, "Database 'bdr' (oid=%u) already exists, not creating", dboid);
+ }
+
+ CommitTransactionCommand();
+
+ Assert(dboid != InvalidOid);
+}
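
What bdr_supervisor_createdb() builds via CreatedbStmt corresponds roughly to this SQL (a sketch only; the worker issues it through createdb() rather than the parser):

    CREATE DATABASE bdr TEMPLATE template1 CONNECTION LIMIT 1;
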
+
+
+/*
+ * The BDR supervisor is a static bgworker that serves as the master/supervisor
+ * for all BDR workers. It exists so that BDR can be enabled and disabled
+ * dynamically for databases.
+ *
+ * It is responsible for identifying BDR-enabled databases at startup and
+ * launching their dynamic per-db workers. It should do as little else as
+ * possible, as it'll run when BDR is in shared_preload_libraries whether
+ * or not it's otherwise actually in use.
+ *
+ * The supervisor worker has no access to any database.
+ */
+void
+bdr_supervisor_worker_main(Datum main_arg)
+{
+ Assert(DatumGetInt32(main_arg) == 0);
+ Assert(IsBackgroundWorker);
+
+ pqsignal(SIGHUP, bdr_sighup);
+ pqsignal(SIGTERM, bdr_sigterm);
+ BackgroundWorkerUnblockSignals();
+
+ /*
+ * Unfortunately we currently can't access shared catalogs like
+	 * pg_shseclabel (where we store information about which databases use BDR)
+ * without being connected to a database. Only shared & nailed catalogs
+ * can be accessed before being connected to a database - and
+ * pg_shseclabel is not one of those.
+ *
+ * Instead we have a database "bdr" that's supposed to be empty which we
+ * just use to read pg_shseclabel. Not pretty, but it works.
+ *
+ * Without copying significant parts of InitPostgres() we can't even read
+ * pg_database without connecting to a database. As we can't connect to
+ * "no database", we must connect to one that always exists, like
+ * template1, then use it to create a dummy database to operate in.
+ *
+ * Once created we set a shmem flag and restart so we know we can connect
+ * to the newly created database.
+ */
+ if (!BdrWorkerCtl->is_supervisor_restart)
+ {
+ BackgroundWorkerInitializeConnection("template1", NULL);
+ bdr_supervisor_createdb();
+
+ BdrWorkerCtl->is_supervisor_restart = true;
+
+ elog(DEBUG1, "BDR supervisor restarting to connect to 'bdr' DB");
+ proc_exit(1);
+ }
+
+ BackgroundWorkerInitializeConnection("bdr", NULL);
+
+ LWLockAcquire(BdrWorkerCtl->lock, LW_EXCLUSIVE);
+ BdrWorkerCtl->supervisor_latch = &MyProc->procLatch;
+ LWLockRelease(BdrWorkerCtl->lock);
+
+ elog(DEBUG1, "BDR supervisor connected to DB 'bdr'");
+
+ SetConfigOption("application_name", "bdr supervisor", PGC_USERSET, PGC_S_SESSION);
+
+ /* mark as idle, before starting to loop */
+ pgstat_report_activity(STATE_IDLE, NULL);
+
+ bdr_supervisor_rescan_dbs();
+
+ while (!got_SIGTERM)
+ {
+ int rc;
+
+ /*
+ * After startup the supervisor doesn't currently have anything to do,
+ * so it can just go to sleep on its latch. It could exit after running
+ * startup, but we're expecting to need it to do other things down the
+ * track, so might as well keep it alive...
+ */
+ rc = WaitLatch(&MyProc->procLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ 180000L);
+
+ ResetLatch(&MyProc->procLatch);
+
+ /* emergency bailout if postmaster has died */
+ if (rc & WL_POSTMASTER_DEATH)
+ proc_exit(1);
+
+ if (got_SIGHUP)
+ {
+ got_SIGHUP = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ if (rc & WL_LATCH_SET)
+ {
+ /*
+ * We've been asked to launch new perdb workers if there are any
+ * changes to security labels.
+ */
+ bdr_supervisor_rescan_dbs();
+ }
+ }
+
+ proc_exit(0);
+}
+
+/*
+ * Register the BDR supervisor bgworker, which will start all the
+ * per-db workers.
+ *
+ * Called in postmaster context from _PG_init.
+ *
+ * The supervisor is guaranteed to be assigned the first shmem slot in our
+ * workers shmem array. This is vital because at this point shmem isn't
+ * allocated yet, so all we can do is tell the supervisor worker its shmem slot
+ * number then actually populate that slot when the postmaster runs our shmem
+ * init callback later.
+ */
+void
+bdr_supervisor_register(void)
+{
+ BackgroundWorker bgw;
+
+ Assert(IsPostmasterEnvironment && !IsUnderPostmaster);
+
+ /*
+ * The supervisor worker accesses shared relations, but does not connect to
+ * any specific database. We still have to flag it as using a connection in
+ * the bgworker API.
+ */
+ bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+ BGWORKER_BACKEND_DATABASE_CONNECTION;
+ bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ bgw.bgw_main = NULL;
+ strncpy(bgw.bgw_library_name, BDR_LIBRARY_NAME, BGW_MAXLEN);
+ strncpy(bgw.bgw_function_name, "bdr_supervisor_worker_main", BGW_MAXLEN);
+ bgw.bgw_restart_time = 1;
+ bgw.bgw_notify_pid = 0;
+ snprintf(bgw.bgw_name, BGW_MAXLEN,
+ "bdr supervisor");
+ bgw.bgw_main_arg = Int32GetDatum(0); /* unused */
+
+ RegisterBackgroundWorker(&bgw);
+}
--- /dev/null
+/* -------------------------------------------------------------------------
+ *
+ * bdr_upgrade.c
+ * Support for upgrading between BDR versions
+ *
+ * Copyright (C) 2012-2015, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * bdr_upgrade.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "bdr.h"
+
+#include "libpq-fe.h"
+#include "miscadmin.h"
+
+#include "libpq/pqformat.h"
+
+#include "catalog/pg_type.h"
+
+#include "storage/ipc.h"
+
+PGDLLEXPORT Datum bdr_upgrade_to_090(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(bdr_upgrade_to_090);
+
+static void
+bdr_upgrade_to_090_insert_connection( PGconn *conn,
+ const char *local_sysid, const char *local_timeline,
+ const char *local_dboid, const char *my_conninfo)
+{
+ PGresult *res;
+ const char *values[8];
+ Oid types[8] =
+ { TEXTOID, OIDOID, OIDOID, TEXTOID, OIDOID, OIDOID, BOOLOID, TEXTOID };
+
+ values[0] = local_sysid;
+ values[1] = local_timeline;
+ values[2] = local_dboid;
+ values[3] = "0";
+ values[4] = "0";
+ values[5] = "0";
+ values[6] = "f";
+ values[7] = &my_conninfo[0];
+ /* TODO: replication sets too! */
+
+ res = PQexecParams(conn, "INSERT INTO bdr.bdr_connections\n"
+ "(conn_sysid, conn_timeline, conn_dboid,\n"
+ " conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,\n"
+ " conn_is_unidirectional, conn_dsn)\n"
+ "VALUES ($1,$2,$3,$4,$5,$6,$7,$8)",
+ 8, types, values, NULL, NULL, false);
+
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ {
+ elog(ERROR, "inserting local info into bdr_connections failed with %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ PQclear(res);
+}
+
+/*
+ * Utility function for upgrading a BDR node running 0.8.0 or older to 0.9.0
+ * (dynamic configuration).
+ *
+ * For the first node of a group it is called with a null remote connection
+ * string; for the 2nd and subsequent nodes it must be given the connection
+ * string of an already-upgraded peer.
+ *
+ * This does some sanity checks to ensure the local node isn't already joined
+ * and that the remote node is actually a known peer with a bdr_nodes entry.
+ *
+ * It then copies the remote end's bdr_connections entries to the local node so
+ * the local node knows which peers to connect to. It inserts a copy of the
+ * local node's bdr_connections entry in the remote and tells the local and
+ * remote nodes to refresh their worker lists.
+ *
+ * This is one long function because it's one-shot code. It's written in C
+ * so it can re-use libpq connections across multiple steps, doing everything
+ * in one transaction.
+ */
+Datum
+bdr_upgrade_to_090(PG_FUNCTION_ARGS)
+{
+ const char *my_conninfo = PG_GETARG_CSTRING(0);
+ const char *remote_conninfo;
+ const char *my_local_conninfo = NULL;
+ PGconn *local_conn = NULL;
+ const char *local_dsn;
+
+ char local_sysid_str[33];
+ char local_timeline_str[33];
+ char local_dboid_str[33];
+
+ stringify_my_node_identity(local_sysid_str, sizeof(local_sysid_str),
+ local_timeline_str, sizeof(local_timeline_str),
+ local_dboid_str, sizeof(local_dboid_str));
+
+ if (!PG_ARGISNULL(1))
+ {
+ my_local_conninfo = PG_GETARG_CSTRING(1);
+ local_dsn = my_local_conninfo;
+ }
+ else
+ {
+ local_dsn = my_conninfo;
+ }
+
+ if (PG_ARGISNULL(2))
+ {
+ elog(NOTICE, "upgrading the first node of a BDR group (remote_conninfo was null)");
+ remote_conninfo = NULL;
+ }
+ else
+ {
+ elog(NOTICE, "upgrading the local node by connecting to an already upgraded peer node");
+ remote_conninfo = PG_GETARG_CSTRING(2);
+ }
+
+ /*
+ * Connect to the local node in non-replication mode.
+ *
+	 * We'll use this connection to COPY bdr_connections data, instead of having
+	 * to mess around constructing and deconstructing bdr_connections tuples. It
+ * also lets us commit autonomously.
+ */
+ local_conn = PQconnectdb(local_dsn);
+
+ if (PQstatus(local_conn) != CONNECTION_OK)
+ {
+ ereport(ERROR,
+ (errmsg("connection to supplied local dsn '%s' failed", local_dsn),
+ errdetail("Connection failed with %s", PQerrorMessage(local_conn))));
+ }
+
+ PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&local_conn));
+ {
+ PGconn *remote_conn = NULL;
+ PGresult *res;
+ remote_node_info ri, li, li_via_remote;
+ Oid nodeid_types[3] = { TEXTOID, OIDOID, OIDOID };
+ const char *local_nodeid_values[3];
+
+ const char * const bdr_nodes_query =
+ "SELECT 1 FROM bdr.bdr_nodes "
+ "WHERE node_sysid = $1 AND node_timeline = $2 AND node_dboid = $3";
+
+ const char * const setup_query =
+ "BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;\n"
+ "SET search_path = bdr, pg_catalog;\n"
+ "SET bdr.permit_unsafe_ddl_commands = on;\n"
+ "SET bdr.skip_ddl_replication = on;\n"
+ "SET bdr.skip_ddl_locking = on;\n"
+ "LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;\n"
+ "LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;\n";
+
+ local_nodeid_values[0] = &local_sysid_str[0];
+ local_nodeid_values[1] = &local_timeline_str[0];
+ local_nodeid_values[2] = &local_dboid_str[0];
+
+ res = PQexec(local_conn, setup_query);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "BEGIN or table locking on local failed: %s",
+ PQresultErrorMessage(res));
+
+ PQclear(res);
+
+ /*
+ * Check that the local connection supplied is usable, and that the
+ * node identity of the endpoint matches the node we're being called
+ * in.
+ *
+		 * This tests the local-only dsn if supplied, otherwise my-dsn,
+		 * i.e. whichever one local_conn was opened with.
+ * (There's no guarantee that my-dsn is even valid from the perspective
+ * of the local node if a local_dsn was also supplied).
+ *
+ * Replication mode isn't tested here. We'll ask the peer to
+ * connect back to us later instead.
+ */
+ bdr_get_remote_nodeinfo_internal(local_conn, &li);
+
+ if (!(li.sysid == GetSystemIdentifier()
+ && li.timeline == ThisTimeLineID
+ && li.dboid == MyDatabaseId))
+ {
+ ereport(ERROR,
+ (errmsg("local dsn %s must point to the local node", local_dsn),
+ errdetail("Expected node identity ("UINT64_FORMAT",%u,%u) but got ("UINT64_FORMAT",%u,%u)",
+ GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId,
+ li.sysid, li.timeline, li.dboid)));
+ }
+
+ if (!li.is_superuser)
+ elog(ERROR, "local connection '%s' must have superuser rights", local_dsn);
+
+ {
+ /*
+ * Check for ourselves in local bdr_nodes by UPDATEing our local
+ * bdr_nodes entry. This will get propagated to the remote end later.
+ *
+ * These values could already be set if a prior upgrade attempt failed
+ * after a local commit and before the remote commit.
+ */
+ const char * node_status;
+ const char * bdr_nodes_update_values[5];
+
+ Oid bdr_nodes_update_types[5] =
+ { TEXTOID, OIDOID, OIDOID, TEXTOID, TEXTOID };
+
+
+ bdr_nodes_update_values[0] = &local_sysid_str[0];
+ bdr_nodes_update_values[1] = &local_timeline_str[0];
+ bdr_nodes_update_values[2] = &local_dboid_str[0];
+
+ if (local_dsn != NULL)
+ bdr_nodes_update_values[3] = local_dsn;
+ else
+ bdr_nodes_update_values[3] = NULL;
+
+ if (remote_conninfo != NULL)
+ bdr_nodes_update_values[4] = remote_conninfo;
+ else
+ bdr_nodes_update_values[4] = NULL;
+
+ res = PQexecParams(local_conn,
+ "UPDATE bdr.bdr_nodes "
+ "SET node_local_dsn = $4, "
+ " node_init_from_dsn = $5 "
+ "WHERE node_sysid = $1 "
+ " AND node_timeline = $2 "
+							   "  AND node_dboid = $3 "
+ "RETURNING node_status",
+ 5, bdr_nodes_update_types, bdr_nodes_update_values,
+ NULL, NULL, 0);
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(ERROR, "updating local bdr_nodes failed: state %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ if (PQntuples(res) != 1)
+ {
+ ereport(ERROR,
+ (errmsg("no entry for local node found in bdr.bdr_nodes"),
+ errdetail("Expected (node_sysid="UINT64_FORMAT",node_timeline=%u,node_dboid=%u) but no such row found in bdr_nodes",
+ GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId)));
+ }
+
+ node_status = PQgetvalue(res, 0, 0);
+
+ if (strcmp(node_status, "r") != 0)
+ {
+ ereport(ERROR,
+ (errmsg("bdr_nodes entry for local node has status != 'r'"),
+ errdetail("Row with (node_sysid="UINT64_FORMAT",node_timeline=%u,node_dboid=%u) but status = '%s' not expected 'r'",
+ GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId, node_status)));
+ }
+
+ }
+
+ /*
+ * Another sanity check: Local bdr_connections must be empty.
+ *
+ * If it isn't then a prior upgrade failed after the local commit
+ * but before the remote commit. The local bdr_connections must be
+ * deleted with replication disabled to prevent the deletion
+ * from being enqueued on the outbound slots. This is done
+ * manually by the user per the docs.
+ */
+ res = PQexec(local_conn, "SELECT 1 FROM bdr.bdr_connections");
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(ERROR, "querying local bdr_connections failed: state %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ if (PQntuples(res) > 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("the local node's bdr.bdr_connections is not empty"),
+ errdetail("No connections from the local node to other nodes may exist when upgrading"),
+ errhint("If a prior upgrade attempt failed see the documentation for recovery steps")));
+ }
+
+ PQclear(res);
+
+ /*
+ * BDR requires a security label to be set on the database in order
+ * to start up.
+ */
+ res = PQexec(local_conn, "SELECT bdr.internal_update_seclabel()");
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(ERROR, "setting local bdr security label failed: state %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ PQclear(res);
+
+
+ /*
+ * If this is the first node, insert an entry for ourselves into
+ * the local bdr_connections. We can't insert into the remote and
+ * have it replicate because there is no remote.
+ */
+ if (remote_conninfo == NULL)
+ {
+ bdr_upgrade_to_090_insert_connection(local_conn, local_sysid_str,
+ local_timeline_str, local_dboid_str, my_conninfo);
+ }
+
+ /*
+ * Establish the connection we'll use to copy the bdr_connections
+ * entries we need and insert our own bdr_connections entry
+ * into the remote end.
+ */
+ if (remote_conninfo != NULL)
+ {
+ StringInfoData dsn;
+
+ initStringInfo(&dsn);
+ appendStringInfo(&dsn,
+ "%s fallback_application_name='"BDR_LOCALID_FORMAT":init'",
+ remote_conninfo, BDR_LOCALID_FORMAT_ARGS);
+ /*
+			 * Connect to the remote node in non-replication mode. Further down
+			 * we check that the remote's bdr.bdr_nodes has an entry for our
+			 * system identifier, i.e. that the remote already knows this node.
+ */
+ remote_conn = PQconnectdb(dsn.data);
+ if (PQstatus(remote_conn) != CONNECTION_OK)
+ {
+ ereport(FATAL,
+ (errmsg("could not connect to the server in non-replication mode: %s",
+ PQerrorMessage(remote_conn)),
+ errdetail("dsn was: %s", dsn.data)));
+ }
+ }
+
+ PG_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&remote_conn));
+ {
+
+ char remote_sysid_str[33];
+ char remote_timeline_str[33];
+ char remote_dboid_str[33];
+ const char *remote_nodeid_values[3];
+
+ if (remote_conn != NULL)
+ {
+ res = PQexec(remote_conn, setup_query);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "BEGIN or table locking on remote failed: %s",
+ PQresultErrorMessage(res));
+
+ PQclear(res);
+
+ /*
+ * Obtain the remote node's identity so we can look it up in the local
+ * bdr_nodes and see if we recognise this node. This will also ensure
+ * BDR is installed on the remote.
+ */
+ bdr_get_remote_nodeinfo_internal(remote_conn, &ri);
+
+ if (ri.sysid == GetSystemIdentifier()
+ && ri.timeline == ThisTimeLineID
+ && ri.dboid == MyDatabaseId)
+ {
+ bdr_error_nodeids_must_differ(ri.sysid, ri.timeline, ri.dboid);
+ }
+
+ if (ri.version_num != BDR_VERSION_NUM)
+ elog(ERROR, "remote end must run BDR version %s but is running %s",
+ BDR_VERSION, ri.version);
+
+ if (!ri.is_superuser)
+ elog(ERROR, "connection must have superuser rights");
+
+ if (strcmp(ri.variant, "BDR") != 0)
+ elog(ERROR, "remote node must be running full BDR, not variant %s",
+ ri.variant);
+
+ /*
+ * As a further sanity check, make sure the remote node can connect back
+ * to the local node, and that the resulting IDs match.
+ */
+ bdr_test_remote_connectback_internal(remote_conn, &li_via_remote, my_conninfo);
+
+ if (!(li_via_remote.sysid == GetSystemIdentifier()
+ && li_via_remote.timeline == ThisTimeLineID
+ && li_via_remote.dboid == MyDatabaseId))
+ {
+ ereport(ERROR,
+ (errmsg("remote node can connect to dsn %s but it doesn't match the local node identity", my_conninfo),
+ errdetail("Expected node identity ("UINT64_FORMAT",%u,%u) but got ("UINT64_FORMAT",%u,%u)",
+ GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId,
+ li_via_remote.sysid, li_via_remote.timeline, li_via_remote.dboid)));
+ }
+
+ if (!li_via_remote.is_superuser)
+ elog(ERROR, "connection from remote node to local node using dsn '%s' must have superuser rights", my_conninfo);
+
+ /*
+ * The basics look sane. Check to see if the target node is present
+ * in the local bdr_nodes. If it isn't then we can't join it with
+ * an upgrade, because it's not an existing peer.
+ */
+
+ stringify_node_identity(remote_sysid_str, sizeof(remote_sysid_str),
+ remote_timeline_str, sizeof(remote_timeline_str),
+ remote_dboid_str, sizeof(remote_dboid_str),
+ ri.sysid, ri.timeline, ri.dboid);
+
+ remote_nodeid_values[0] = &remote_sysid_str[0];
+ remote_nodeid_values[1] = &remote_timeline_str[0];
+ remote_nodeid_values[2] = &remote_dboid_str[0];
+
+ res = PQexecParams(local_conn, bdr_nodes_query, 3, nodeid_types, remote_nodeid_values, NULL, NULL, 0);
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(ERROR, "Querying local bdr_nodes for remote nodeid failed: state %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ if (PQntuples(res) == 0)
+ {
+ /* Looks like we didn't find the expected node entry */
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("The remote node identified by the passed remote connection string is not known locally"),
+					 errdetail("The remote node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the corresponding (node_sysid,node_timeline,node_dboid) is present in the local bdr.bdr_nodes",
+ ri.sysid, ri.timeline, ri.dboid),
+ errhint("You can only upgrade a node by connecting to a node it was already joined to before the BDR version update")));
+ }
+
+ Assert(PQntuples(res) == 1);
+
+ PQclear(res);
+
+ /*
+ * Now ensure that our node is known to the remote end
+ */
+ res = PQexecParams(remote_conn, bdr_nodes_query, 3, nodeid_types,
+ local_nodeid_values, NULL, NULL, 0);
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(ERROR, "Querying remote bdr_nodes for local nodeid failed: state %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ if (PQntuples(res) == 0)
+ {
+ /*
+ * We're not known to the remote node so we can't do an upgrade
+ * join to it.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("The node identified by the passed connection string does not recognise the local node"),
+					 errdetail("The local node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the corresponding (node_sysid,node_timeline,node_dboid) is present in the remote bdr.bdr_nodes",
+ GetSystemIdentifier(), ThisTimeLineID, MyDatabaseId),
+ errhint("You can only upgrade a node by connecting to a node it was already joined to before the BDR version update")));
+ }
+
+ Assert(PQntuples(res) == 1);
+
+ PQclear(res);
+
+ /*
+ * We now know there's a bdr_nodes entry on each end. Ensure that the
+		 * remote end contains at least a bdr_connections entry for itself
+ * and does NOT contain a connection for us.
+ */
+ res = PQexec(remote_conn,
+ "SELECT 1 "
+ "FROM bdr.bdr_connections c, "
+ " bdr.bdr_get_local_nodeid() l "
+ "WHERE c.conn_sysid = l.sysid "
+ " AND c.conn_timeline = l.timeline "
+ " AND c.conn_dboid = l.dboid "
+ );
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(ERROR, "Querying remote bdr_connections failed: state %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ if (PQntuples(res) != 1)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("The node identified by the passed connection string does not yet have a connection entry for its own node"),
+						 errdetail("The remote node's identity is ("UINT64_FORMAT",%u,%u) but no entry for the corresponding (conn_sysid,conn_timeline,conn_dboid) is present in the remote bdr.bdr_connections",
+ ri.sysid, ri.timeline, ri.dboid),
+ errhint("You must have already upgraded the other node before you can use it to upgrade this node.")));
+ }
+
+ PQclear(res);
+
+ res = PQexecParams(remote_conn,
+ "SELECT 1 "
+ "FROM bdr.bdr_connections c "
+ "WHERE c.conn_sysid = $1 "
+ " AND c.conn_timeline = $2 "
+ " AND c.conn_dboid = $3 ",
+ 3, nodeid_types, local_nodeid_values, NULL, NULL, 0);
+
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ {
+ elog(ERROR, "Querying remote bdr_connections failed: state %s: %s\n",
+ PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res));
+ }
+
+ if (PQntuples(res) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("The node identified by the passed connection string already has a connection string for the local node"),
+ errdetail("The local node's identity (conn_sysid="UINT64_FORMAT",conn_timeline=%u,conn_dboid=%u) already has an entry in the remote bdr.bdr_connections",
+								   li.sysid, li.timeline, li.dboid),
+ errhint("You must have already upgraded the other node before you can use it to upgrade this node.")));
+ }
+
+ PQclear(res);
+
+ /*
+ * Alright, time to actually perform the upgrade.
+ *
+ * We need to:
+ *
+ * - Copy remote bdr_connections entries to the local node
+ *
+ * - Upsert a row for the local node in the remote's
+ * bdr_connections
+ *
+ * - Register an on commit hook on the remote to rescan
+ * bdr_connections.
+ *
+ * - Register an on commit hook on the local side to rescan
+ * bdr_connections
+ *
+ * - set the local security label
+ *
+ * - Commit the remote transaction, adding the bdr_connections
+ * row
+ *
+ * - Return, allowing a commit to occur to save the local
+ * bdr_connections entries.
+ */
+
+ bdr_copytable(remote_conn, local_conn,
+ "COPY (SELECT * FROM bdr.bdr_connections) TO stdout",
+ "COPY bdr.bdr_connections FROM stdin");
+
+ /*
+ * Time to insert connection info about us into the remote node and ask it
+ * to connect back to us, then tell the other nodes. We don't update
+ * the remote's bdr_nodes entry for us, as the change we applied locally
+ * will get replicated.
+ *
+ * Since we have a remote conn we didn't insert our
+ * bdr_connections entry locally above. Insert it into the
+ * remote node now instead. It'll replicate back to the local
+ * node when we connect to the upstream.
+ */
+ bdr_upgrade_to_090_insert_connection(remote_conn, local_sysid_str,
+ local_timeline_str, local_dboid_str, my_conninfo);
+
+ res = PQexec(remote_conn, "SELECT bdr.bdr_connections_changed()");
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ elog(ERROR, "SELECT bdr.bdr_connections_changed() on remote failed: %s",
+ PQresultErrorMessage(res));
+
+ PQclear(res);
+
+ res = PQexec(remote_conn, "INSERT INTO bdr.bdr_queued_commands\n"
+ "(lsn, queued_at, perpetrator, command_tag, command)\n"
+ "VALUES (pg_current_xlog_insert_location(), current_timestamp,\n"
+ " current_user, 'SELECT',\n"
+ " 'SELECT bdr.bdr_connections_changed()');");
+
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "enqueuing bdr.bdr_connections_changed() in the ddl rep queue failed: %s",
+ PQresultErrorMessage(res));
+ }
+
+ res = PQexec(local_conn, "SELECT bdr.bdr_connections_changed()");
+ if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ elog(ERROR, "SELECT bdr.bdr_connections_changed() on local failed: %s",
+ PQresultErrorMessage(res));
+
+ PQclear(res);
+
+ res = PQexec(local_conn, "COMMIT");
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+		elog(ERROR, "COMMIT on local failed: %s",
+ PQresultErrorMessage(res));
+
+ PQclear(res);
+
+ if (remote_conn != NULL)
+ {
+ res = PQexec(remote_conn, "COMMIT");
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ elog(ERROR, "COMMIT on remote failed: %s",
+ PQresultErrorMessage(res));
+
+ PQclear(res);
+
+ free_remote_node_info(&ri);
+ }
+
+ free_remote_node_info(&li);
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&remote_conn));
+
+ PQfinish(remote_conn);
+
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(bdr_cleanup_conn_close,
+ PointerGetDatum(&local_conn));
+
+ PQfinish(local_conn);
+
+ PG_RETURN_VOID();
+}
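
A hypothetical invocation on the node being upgraded, assuming the extension script exposes the C function above as bdr.bdr_upgrade_to_090 with the same three arguments (my_conninfo, my_local_conninfo, remote_conninfo); the DSNs are placeholders:

    SELECT bdr.bdr_upgrade_to_090(
        'host=node2 dbname=mydb',   -- my_conninfo: DSN other nodes use to reach this node
        NULL,                       -- my_local_conninfo: optional local-only DSN
        'host=node1 dbname=mydb');  -- remote_conninfo: an already-upgraded peer,
                                    -- or NULL when upgrading the first node
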
CREATE TABLE tbl_without_oids() WITHOUT oids;
DROP TABLE tbl_without_oids;
SET default_with_oids = false;
+SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), pid) FROM pg_stat_replication;
+ pg_xlog_wait_remote_apply
+---------------------------
+
+
+(2 rows)
+
--- AGGREGATE ---
\c postgres
CREATE AGGREGATE test_avg (
GRANT ALL ON SCHEMA public TO nonsuper;
\c regression
GRANT ALL ON SCHEMA public TO nonsuper;
-SELECT pg_sleep(10);
- pg_sleep
-----------
-
-(1 row)
-
--- emulate the pg_xlog_wait_remote_apply on vanilla postgres
-DO $DO$BEGIN
- PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
- IF FOUND THEN
- RETURN;
- END IF;
-
- PERFORM bdr.bdr_replicate_ddl_command($DDL$
- CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
- AS $FUNC$
- BEGIN
- WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
- PERFORM pg_sleep(0.01);
- END LOOP;
- END;$FUNC$ LANGUAGE plpgsql;
- $DDL$);
-END;$DO$;
-SELECT bdr.bdr_replicate_ddl_command($DDL$
-CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
- OUT readdb1 text,
- OUT readdb2 text,
- OUT writedb1 text,
- OUT writedb2 text
- ) RETURNS record LANGUAGE SQL AS $f$
-SELECT
- current_setting('bdrtest.readdb1'),
- current_setting('bdrtest.readdb2'),
- current_setting('bdrtest.writedb1'),
- current_setting('bdrtest.writedb2')
-$f$;
-$DDL$);
- bdr_replicate_ddl_command
----------------------------
-
-(1 row)
-
+\c postgres
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
+\c regression
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
--- /dev/null
+\c postgres
+SELECT bdr.bdr_group_create(
+ dsn := 'dbname=postgres',
+ replication_sets := ARRAY['default', 'important', 'for-node-1']
+ );
+ bdr_group_create
+------------------
+
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready
+------------------------------
+
+(1 row)
+
+\c regression
+SELECT bdr.bdr_group_join(
+ dsn := 'dbname=regression',
+ init_from_dsn := 'dbname=postgres',
+ local_dsn := 'dbname=regression',
+ replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+ );
+ bdr_group_join
+----------------
+
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready
+------------------------------
+
+(1 row)
+
+-- Make sure we see two slots and two active connections
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+ plugin | slot_type | database | active
+--------+-----------+------------+--------
+ bdr | logical | postgres | t
+ bdr | logical | regression | t
+(2 rows)
+
+SELECT count(*) FROM pg_stat_replication;
+ count
+-------
+ 2
+(1 row)
+
+\c postgres
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+ conn_dsn | conn_replication_sets
+-------------------+--------------------------------------------------------------------------------------
+ dbname=postgres | {default,important,for-node-1}
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(2 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+ node_status | node_local_dsn | node_init_from_dsn
+-------------+-------------------+--------------------
+ r | dbname=postgres |
+ r | dbname=regression | dbname=postgres
+(2 rows)
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+ conn_dsn | conn_replication_sets
+-------------------+--------------------------------------------------------------------------------------
+ dbname=postgres | {default,important,for-node-1}
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(2 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+ node_status | node_local_dsn | node_init_from_dsn
+-------------+-------------------+--------------------
+ r | dbname=postgres |
+ r | dbname=regression | dbname=postgres
+(2 rows)
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+ OUT readdb1 text,
+ OUT readdb2 text,
+ OUT writedb1 text,
+ OUT writedb2 text
+ ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+ current_setting('bdrtest.readdb1'),
+ current_setting('bdrtest.readdb2'),
+ current_setting('bdrtest.writedb1'),
+ current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
+ bdr_replicate_ddl_command
+---------------------------
+
+(1 row)
+
--- /dev/null
+\c postgres
+SELECT bdr.bdr_subscribe(
+ remote_dsn := 'dbname=regression',
+ local_dsn := 'dbname=postgres',
+ replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+ );
+ bdr_subscribe
+---------------
+
+(1 row)
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+ bdr_node_join_wait_for_ready
+------------------------------
+
+(1 row)
+
+-- Make sure we see the slot and active connection
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+ plugin | slot_type | database | active
+--------+-----------+------------+--------
+ bdr | logical | regression | t
+(1 row)
+
+SELECT count(*) FROM pg_stat_replication;
+ count
+-------
+ 1
+(1 row)
+
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+ conn_dsn | conn_replication_sets
+-------------------+--------------------------------------------------------------------------------------
+ dbname=regression | {default,important,for-node-2,for-node-2-insert,for-node-2-update,for-node-2-delete}
+(1 row)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+ node_status | node_local_dsn | node_init_from_dsn
+-------------+-----------------+--------------------
+ r | dbname=postgres | dbname=regression
+(1 row)
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+ conn_dsn | conn_replication_sets
+----------+-----------------------
+(0 rows)
+
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+ node_status | node_local_dsn | node_init_from_dsn
+-------------+----------------+--------------------
+(0 rows)
+
+-- emulate the pg_xlog_wait_remote_apply on vanilla postgres
+DO $DO$BEGIN
+ PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
+ IF FOUND THEN
+ RETURN;
+ END IF;
+
+ PERFORM bdr.bdr_replicate_ddl_command($DDL$
+ CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
+ AS $FUNC$
+ BEGIN
+ WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
+ PERFORM pg_sleep(0.01);
+ END LOOP;
+ END;$FUNC$ LANGUAGE plpgsql;
+ $DDL$);
+END;$DO$;
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+ OUT readdb1 text,
+ OUT readdb2 text,
+ OUT writedb1 text,
+ OUT writedb2 text
+ ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+ current_setting('bdrtest.readdb1'),
+ current_setting('bdrtest.readdb2'),
+ current_setting('bdrtest.writedb1'),
+ current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
+ bdr_replicate_ddl_command
+---------------------------
+
+(1 row)
+
--- /dev/null
+Parsed test spec with 3 sessions
+
+starting permutation: setup1 setup2 setup3 join_root join_2 wait_join_2 check_join_2 join_3 wait_join_3 check_join_3 wait
+step setup1:
+ CREATE EXTENSION btree_gist;
+ CREATE EXTENSION bdr;
+
+step setup2:
+ CREATE EXTENSION btree_gist;
+ CREATE EXTENSION bdr;
+
+step setup3:
+ CREATE EXTENSION btree_gist;
+ CREATE EXTENSION bdr;
+
+step join_root:
+ SELECT bdr.bdr_group_create(
+ dsn := 'dbname=node1'
+ );
+
+bdr_group_create
+
+
+step join_2:
+ SELECT bdr.bdr_group_join(
+ dsn := 'dbname=node2',
+ init_from_dsn := 'dbname=node1'
+ );
+
+bdr_group_join
+
+
+step wait_join_2:
+ SELECT bdr.bdr_node_join_wait_for_ready();
+
+bdr_node_join_wait_for_ready
+
+
+step check_join_2:
+ SELECT pg_stat_clear_snapshot();
+ SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+ SELECT count(*) FROM pg_stat_replication;
+ SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+ SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+pg_stat_clear_snapshot
+
+
+plugin slot_type database active
+
+bdr logical node1 t
+bdr logical node2 t
+count
+
+2
+conn_dsn conn_replication_sets
+
+dbname=node1 {default}
+dbname=node2 {default}
+node_status node_local_dsn node_init_from_dsn
+
+r dbname=node1
+r dbname=node2 dbname=node1
+step join_3:
+ SELECT bdr.bdr_group_join(
+ dsn := 'dbname=node3',
+ init_from_dsn := 'dbname=node1',
+ local_dsn := 'dbname=node3'
+ );
+
+bdr_group_join
+
+
+step wait_join_3:
+ SELECT bdr.bdr_node_join_wait_for_ready();
+
+bdr_node_join_wait_for_ready
+
+
+step check_join_3:
+ SELECT pg_stat_clear_snapshot();
+ SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+ SELECT count(*) FROM pg_stat_replication;
+ SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+ SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+pg_stat_clear_snapshot
+
+
+plugin slot_type database active
+
+bdr logical node1 t
+bdr logical node1 t
+bdr logical node2 t
+bdr logical node2 t
+bdr logical node3 t
+bdr logical node3 t
+count
+
+6
+conn_dsn conn_replication_sets
+
+dbname=node1 {default}
+dbname=node2 {default}
+dbname=node3 {default}
+node_status node_local_dsn node_init_from_dsn
+
+r dbname=node1
+r dbname=node2 dbname=node1
+r dbname=node3 dbname=node1
+step wait:
+ -- pg_xlog_wait_remote_apply isn't good enough alone
+ -- as it doesn't permit us to say how many nodes must be present.
+ -- It'll succeed if there are zero nodes. So we first have to wait
+ -- for enough replication connections.
+ DO $$
+ DECLARE
+ nodecount integer := 0;
+ target_lsn pg_lsn;
+ BEGIN
+ WHILE nodecount <> 6
+ LOOP
+ PERFORM pg_sleep(1);
+ PERFORM pg_stat_clear_snapshot();
+ -- Now find out how many walsenders are running
+ nodecount := (SELECT count(*)
+ FROM pg_catalog.pg_stat_replication);
+ RAISE NOTICE 'Found % nodes',nodecount;
+ END LOOP;
+ -- OK, all nodes seen, now we wait for catchup on them all.
+ target_lsn := pg_current_xlog_location();
+ RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
+ PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
+ RAISE NOTICE 'Catchup to LSN completed';
+ END;
+ $$;
+
+++ /dev/null
-Parsed test spec with 1 sessions
-
-starting permutation: wait
-step wait:
- DO $$
- DECLARE
- nodecount integer := 0;
- target_lsn pg_lsn;
- BEGIN
- WHILE nodecount <> 6
- LOOP
- PERFORM pg_sleep(1);
- PERFORM pg_stat_clear_snapshot();
- -- Now find out how many walsenders are running
- nodecount := (SELECT count(*)
- FROM pg_catalog.pg_stat_replication);
- RAISE NOTICE 'Found % nodes',nodecount;
- END LOOP;
- -- OK, all nodes seen, now we wait for catchup on them all.
- target_lsn := pg_current_xlog_location();
- RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
- PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
- RAISE NOTICE 'Catchup to LSN completed';
- END;
- $$;
-
DROP EXTENSION bdr;
CREATE EXTENSION bdr VERSION '0.9.0.0';
DROP EXTENSION bdr;
+CREATE EXTENSION bdr VERSION '0.9.0.1';
+DROP EXTENSION bdr;
-- evolve version one by one from the oldest to the newest one
CREATE EXTENSION bdr VERSION '0.8.0';
ALTER EXTENSION bdr UPDATE TO '0.8.0.1';
ALTER EXTENSION bdr UPDATE TO '0.8.0.6';
ALTER EXTENSION bdr UPDATE TO '0.8.0.7';
ALTER EXTENSION bdr UPDATE TO '0.9.0.0';
+ALTER EXTENSION bdr UPDATE TO '0.9.0.1';
-- Should never have to do anything: You missed adding the new version above.
ALTER EXTENSION bdr UPDATE;
-NOTICE: version "0.9.0.0" of extension "bdr" is already installed
+NOTICE: version "0.9.0.1" of extension "bdr" is already installed
\c postgres
DROP DATABASE extension_upgrade;
COMMENT ON COLUMN bdr_nodes.node_dboid IS 'local database oid on the cluster (node_sysid, node_timeline)';
COMMENT ON COLUMN bdr_nodes.node_status IS 'Readiness of the node: [i]nitializing, [c]atchup, [r]eady. Doesn''t indicate connected/disconnected.';
+-- We don't exclude bdr_nodes with pg_extension_config_dump
+-- because this is a global table that's sync'd between nodes.
+
CREATE TABLE bdr_global_locks(
locktype text NOT NULL,
--- /dev/null
+-- Data structures for BDR's dynamic configuration management
+
+SET LOCAL search_path = bdr;
+SET bdr.permit_unsafe_ddl_commands = true;
+SET bdr.skip_ddl_replication = true;
+
+ALTER TABLE bdr.bdr_nodes
+ ADD COLUMN node_local_dsn text,
+ ADD COLUMN node_init_from_dsn text;
+
+ALTER TABLE bdr.bdr_nodes
+ DROP CONSTRAINT bdr_nodes_node_status_check;
+
+ALTER TABLE bdr.bdr_nodes
+ ADD CONSTRAINT bdr_nodes_node_status_check
+ CHECK (node_status in ('b', 'i', 'c', 'o', 'r'));
+
+CREATE TABLE bdr_connections (
+ conn_sysid text not null,
+ conn_timeline oid not null,
+ conn_dboid oid not null, -- This is an oid local to the node_sysid cluster
+
+ -- Wondering why there's no FOREIGN KEY to bdr.bdr_nodes?
+ -- bdr.bdr_nodes won't be populated when the bdr.bdr_connections
+ -- row gets created on the local node.
+
+ -- These fields may later be used by BDR to override connection
+ -- settings from one node to a particular other node. At the
+ -- moment their main use is for UDR connections, where we must
+ -- ensure that the connection is only made from one particular
+ -- node.
+ conn_origin_sysid text,
+ conn_origin_timeline oid,
+ conn_origin_dboid oid,
+
+ PRIMARY KEY(conn_sysid, conn_timeline, conn_dboid,
+ conn_origin_sysid, conn_origin_timeline, conn_origin_dboid),
+
+ -- Either a whole origin ID (for an override or UDR entry) or no
+ -- origin ID may be provided.
+ CONSTRAINT origin_all_or_none_null
+ CHECK ((conn_origin_sysid = '0') = (conn_origin_timeline = 0)
+ AND (conn_origin_sysid = '0') = (conn_origin_dboid = 0)),
+
+ -- Indicates that this connection is unidirectional; there won't be
+ -- a corresponding inbound connection from the peer node. Only permitted
+ -- where the conn_origin fields are set.
+ conn_is_unidirectional boolean not null default false,
+
+ CONSTRAINT unidirectional_conn_must_have_origin
+ CHECK ((NOT conn_is_unidirectional) OR (conn_origin_sysid <> '0')),
+
+ conn_dsn text not null,
+
+ conn_apply_delay integer
+ CHECK (conn_apply_delay >= 0),
+
+ conn_replication_sets text[]
+);
+
+REVOKE ALL ON TABLE bdr_connections FROM public;
+
+COMMENT ON TABLE bdr_connections IS 'Connection information for nodes in the group. Don''t modify this directly, use the provided functions. One entry should exist per node in the group.';
+
+COMMENT ON COLUMN bdr_connections.conn_sysid IS 'System identifier for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_timeline IS 'System timeline ID for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_dboid IS 'System database OID for the node this entry''s dsn refers to';
+COMMENT ON COLUMN bdr_connections.conn_origin_sysid IS 'If set, ignore this entry unless the local sysid is this';
+COMMENT ON COLUMN bdr_connections.conn_origin_timeline IS 'If set, ignore this entry unless the local timeline is this';
+COMMENT ON COLUMN bdr_connections.conn_origin_dboid IS 'If set, ignore this entry unless the local dboid is this';
+COMMENT ON COLUMN bdr_connections.conn_dsn IS 'A libpq-style connection string specifying how to make a connection to this node from other nodes.';
+COMMENT ON COLUMN bdr_connections.conn_apply_delay IS 'If set, milliseconds to wait before applying each transaction from the remote node. Mainly for debugging. If null, the global default applies.';
+COMMENT ON COLUMN bdr_connections.conn_replication_sets IS 'Replication sets this connection should participate in, if non-default.';
+
+SELECT pg_catalog.pg_extension_config_dump('bdr_connections', '');
+
+CREATE FUNCTION bdr_connections_changed()
+RETURNS void LANGUAGE c AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION bdr_connections_changed() FROM public;
+
+COMMENT ON FUNCTION bdr_connections_changed() IS 'Internal BDR function, do not call directly.';
+
+
+--
+-- This is a helper for node_join, for internal use only. It's called
+-- on the remote end by the init code when joining an existing group,
+-- to do the remote-side setup.
+--
+CREATE FUNCTION bdr.internal_node_join(
+ sysid text, timeline oid, dboid oid,
+ dsn text,
+ apply_delay integer,
+ replication_sets text[]
+ )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+AS
+$body$
+DECLARE
+ status "char";
+BEGIN
+ LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;
+ LOCK TABLE pg_catalog.pg_shseclabel IN EXCLUSIVE MODE;
+
+ IF bdr_variant() <> 'BDR' THEN
+ RAISE USING
+ MESSAGE = 'Full BDR required but this module is built for '||bdr_variant(),
+ DETAIL = 'The target node is running something other than full BDR so you cannot join a BDR node to it',
+ HINT = 'Install full BDR if possible or use the UDR functions.',
+ ERRCODE = 'feature_not_supported';
+ END IF;
+
+ -- Assert that we have a bdr_nodes entry with state = i on this node
+  SELECT node_status INTO status
+ FROM bdr.bdr_nodes
+ WHERE node_sysid = sysid
+ AND node_timeline = timeline
+ AND node_dboid = dboid;
+
+ IF NOT FOUND THEN
+ RAISE object_not_in_prerequisite_state
+ USING MESSAGE = format('bdr.bdr_nodes entry for (%s,%s,%s) not found',
+ sysid, timeline, dboid);
+ END IF;
+
+ IF status <> 'i' THEN
+ RAISE object_not_in_prerequisite_state
+ USING MESSAGE = format('bdr.bdr_nodes entry for (%s,%s,%s) has unexpected status %L (expected ''i'')',
+ sysid, timeline, dboid, status);
+ END IF;
+
+  -- Insert or update the connection info on this node, which the joining
+  -- node is initing from.
+ -- No need to care about concurrency here as we hold EXCLUSIVE LOCK.
+ BEGIN
+ INSERT INTO bdr.bdr_connections
+ (conn_sysid, conn_timeline, conn_dboid,
+ conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+ conn_dsn,
+ conn_apply_delay, conn_replication_sets,
+ conn_is_unidirectional)
+ VALUES
+ (sysid, timeline, dboid,
+ '0', 0, 0,
+ dsn,
+ CASE WHEN apply_delay = -1 THEN NULL ELSE apply_delay END,
+ replication_sets, false);
+ EXCEPTION WHEN unique_violation THEN
+ UPDATE bdr.bdr_connections
+ SET conn_dsn = dsn,
+ conn_apply_delay = CASE WHEN apply_delay = -1 THEN NULL ELSE apply_delay END,
+ conn_replication_sets = replication_sets,
+ conn_is_unidirectional = false
+ WHERE conn_sysid = sysid
+ AND conn_timeline = timeline
+ AND conn_dboid = dboid
+ AND conn_origin_sysid = '0'
+ AND conn_origin_timeline = 0
+ AND conn_origin_dboid = 0;
+ END;
+
+ -- Schedule the apply worker launch for commit time
+ PERFORM bdr.bdr_connections_changed();
+
+ -- and ensure the apply worker is launched on other nodes
+ -- when this transaction replicates there, too.
+ INSERT INTO bdr.bdr_queued_commands
+ (lsn, queued_at, perpetrator, command_tag, command)
+ VALUES
+ (pg_current_xlog_insert_location(), current_timestamp, current_user,
+ 'SELECT', 'SELECT bdr.bdr_connections_changed()');
+END;
+$body$;
+
+
+CREATE FUNCTION bdr.internal_update_seclabel()
+RETURNS void LANGUAGE plpgsql
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+ v_label json;
+BEGIN
+  -- Update the 'bdr' key in the current label, if there is one.
+  -- (Right now there's not much point to this, but later we may
+  -- store more information in there.)
+
+ -- first select existing label
+ SELECT label::json INTO v_label
+ FROM pg_catalog.pg_shseclabel
+ WHERE provider = 'bdr'
+ AND classoid = 'pg_database'::regclass
+ AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+  -- then set the 'bdr' key to true, preserving any other keys
+ SELECT json_object_agg(key, value) INTO v_label
+ FROM (
+ SELECT key, value
+ FROM json_each(v_label)
+ WHERE key <> 'bdr'
+ UNION ALL
+ SELECT 'bdr', to_json(true)
+ ) d;
+
+ -- and set the newly computed label
+ -- (It's safe to do this early, it won't take effect
+ -- until commit)
+ EXECUTE format('SECURITY LABEL FOR bdr ON DATABASE %I IS %L',
+ current_database(), v_label);
+END;
+$body$;
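+
+-- For illustration only (not executed, and assuming a database named 'mydb'
+-- with no pre-existing label): the function above ends up issuing something
+-- equivalent to
+--
+--   SECURITY LABEL FOR bdr ON DATABASE mydb IS '{ "bdr" : true }';
+--
+-- Keys already present in the label are carried over unchanged; only the
+-- 'bdr' key is (re)set to true.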
+
+-- Setup that's common to BDR and UDR joins
+CREATE FUNCTION bdr.internal_begin_join(caller text, local_dsn text, remote_dsn text,
+ remote_sysid OUT text, remote_timeline OUT oid, remote_dboid OUT oid
+)
+RETURNS record LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+ localid RECORD;
+ localid_from_dsn RECORD;
+ remote_nodeinfo RECORD;
+BEGIN
+ -- Only one tx can be adding connections
+ LOCK TABLE bdr.bdr_connections IN EXCLUSIVE MODE;
+ LOCK TABLE bdr.bdr_nodes IN EXCLUSIVE MODE;
+ LOCK TABLE pg_catalog.pg_shseclabel IN EXCLUSIVE MODE;
+
+ SELECT sysid, timeline, dboid INTO localid
+ FROM bdr.bdr_get_local_nodeid();
+
+ -- If there's already an entry for ourselves in bdr.bdr_connections
+ -- then we know this node is part of an active BDR group and cannot
+ -- be joined to another group. Unidirectional connections are ignored.
+ PERFORM 1 FROM bdr_connections
+ WHERE conn_sysid = localid.sysid
+ AND conn_timeline = localid.timeline
+ AND conn_dboid = localid.dboid
+ AND (conn_origin_sysid = '0'
+ AND conn_origin_timeline = 0
+ AND conn_origin_dboid = 0)
+ AND conn_is_unidirectional = 'f';
+
+ IF FOUND THEN
+ RAISE USING
+ MESSAGE = 'This node is already a member of a BDR group',
+ HINT = 'Connect to the node you wish to add and run '||caller||' from it instead',
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+
+ -- Validate that the local connection is usable and matches
+ -- the node identity of the node we're running on.
+ --
+ -- For BDR this will NOT check the 'dsn' if 'local_dsn'
+ -- gets supplied. We don't know if 'dsn' is even valid
+ -- for loopback connections and can't assume it is. That'll
+ -- get checked later by BDR specific code.
+ SELECT * INTO localid_from_dsn
+ FROM bdr_get_remote_nodeinfo(local_dsn);
+
+ IF localid_from_dsn.sysid <> localid.sysid
+ OR localid_from_dsn.timeline <> localid.timeline
+ OR localid_from_dsn.dboid <> localid.dboid
+ THEN
+ RAISE USING
+ MESSAGE = 'node identity for local dsn does not match current node',
+ DETAIL = format($$The dsn '%s' connects to a node with identity (%s,%s,%s) but the local node is (%s,%s,%s)$$,
+ local_dsn, localid_from_dsn.sysid, localid_from_dsn.timeline,
+ localid_from_dsn.dboid, localid.sysid, localid.timeline, localid.dboid),
+ HINT = 'The local_dsn (or, for bdr, dsn if local_dsn is null) parameter must refer to the node you''re running this function from',
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+
+ IF NOT localid_from_dsn.is_superuser THEN
+ RAISE USING
+ MESSAGE = 'local dsn does not have superuser rights',
+ DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, local_dsn),
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+
+ -- Now interrogate the remote node, if specified, and sanity
+ -- check its connection too. The discovered node identity is
+ -- returned if found.
+ --
+ -- This will error out if there are issues with the remote
+ -- node.
+ IF remote_dsn IS NOT NULL THEN
+ SELECT * INTO remote_nodeinfo
+ FROM bdr_get_remote_nodeinfo(remote_dsn);
+
+ remote_sysid := remote_nodeinfo.sysid;
+ remote_timeline := remote_nodeinfo.timeline;
+ remote_dboid := remote_nodeinfo.dboid;
+
+ IF NOT remote_nodeinfo.is_superuser THEN
+ RAISE USING
+ MESSAGE = 'connection to remote node does not have superuser rights',
+ DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, remote_dsn),
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+
+ IF remote_nodeinfo.version_num < bdr_min_remote_version_num() THEN
+ RAISE USING
+ MESSAGE = 'remote node''s BDR version is too old',
+ DETAIL = format($$The dsn '%s' connects successfully but the remote node version %s is less than the required version %s$$,
+ remote_dsn, remote_nodeinfo.version_num, bdr_min_remote_version_num()),
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+
+ IF remote_nodeinfo.min_remote_version_num > bdr_version_num() THEN
+ RAISE USING
+ MESSAGE = 'remote node''s BDR version is too new or this node''s version is too old',
+ DETAIL = format($$The dsn '%s' connects successfully but the remote node version %s requires this node to run at least bdr %s, not the current %s$$,
+ remote_dsn, remote_nodeinfo.version_num, remote_nodeinfo.min_remote_version_num,
+                bdr_version_num()),
+ ERRCODE = 'object_not_in_prerequisite_state';
+
+ END IF;
+
+ END IF;
+
+ -- Create local node record if needed
+ PERFORM 1 FROM bdr_nodes
+ WHERE node_sysid = localid.sysid
+ AND node_timeline = localid.timeline
+ AND node_dboid = localid.dboid;
+
+ IF NOT FOUND THEN
+ INSERT INTO bdr_nodes (
+ node_sysid, node_timeline, node_dboid,
+ node_status, node_local_dsn, node_init_from_dsn
+ ) VALUES (
+ localid.sysid, localid.timeline, localid.dboid,
+ 'b', local_dsn, remote_dsn
+ );
+ END IF;
+
+ PERFORM bdr.internal_update_seclabel();
+END;
+$body$;
+
+--
+-- The public interface for node join/addition, to be run to join a currently
+-- unconnected node with a blank database to a BDR group.
+--
+CREATE FUNCTION bdr.bdr_group_join(
+ dsn text,
+ init_from_dsn text,
+ local_dsn text DEFAULT NULL,
+ apply_delay integer DEFAULT NULL,
+ replication_sets text[] DEFAULT ARRAY['default']
+ )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+ localid record;
+ connectback_nodeinfo record;
+ remoteinfo record;
+BEGIN
+ IF dsn IS NULL THEN
+ RAISE USING
+ MESSAGE = 'dsn may not be null',
+ ERRCODE = 'invalid_parameter_value';
+ END IF;
+
+ IF bdr_variant() <> 'BDR' THEN
+ RAISE USING
+ MESSAGE = 'Full BDR required but this module is built for '||bdr_variant(),
+ DETAIL = 'The local node is not running full BDR, which is required to use bdr_join',
+ HINT = 'Install full BDR if possible or use the UDR functions.',
+ ERRCODE = 'feature_not_supported';
+ END IF;
+
+ PERFORM bdr.internal_begin_join(
+ 'bdr_group_join',
+ CASE WHEN local_dsn IS NULL THEN dsn ELSE local_dsn END,
+ init_from_dsn);
+
+ SELECT sysid, timeline, dboid INTO localid
+ FROM bdr.bdr_get_local_nodeid();
+
+    -- Request additional connection tests to determine that the remote is
+    -- reachable in both replication and non-replication modes, and that the
+    -- remote can connect back to us via 'dsn' in both modes as well.
+ --
+ -- This cannot be checked for the first node since there's no peer
+ -- to ask for help.
+ IF init_from_dsn IS NOT NULL THEN
+
+ SELECT * INTO connectback_nodeinfo
+ FROM bdr.bdr_test_remote_connectback(init_from_dsn, dsn);
+
+ -- The connectback must actually match our local node identity
+ -- and must provide a superuser connection.
+ IF NOT connectback_nodeinfo.is_superuser THEN
+ RAISE USING
+ MESSAGE = 'dsn does not have superuser rights when connecting via remote node',
+ DETAIL = format($$The dsn '%s' connects successfully but does not grant superuser rights$$, dsn),
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+
+ IF connectback_nodeinfo.sysid <> localid.sysid
+ OR connectback_nodeinfo.timeline <> localid.timeline
+ OR connectback_nodeinfo.dboid <> localid.dboid
+ THEN
+ RAISE USING
+ MESSAGE = 'node identity for dsn does not match current node when connecting back via remote',
+ DETAIL = format($$The dsn '%s' connects to a node with identity (%s,%s,%s) but the local node is (%s,%s,%s)$$,
+                    dsn, connectback_nodeinfo.sysid, connectback_nodeinfo.timeline,
+ connectback_nodeinfo.dboid, localid.sysid, localid.timeline, localid.dboid),
+ HINT = 'The ''dsn'' parameter must refer to the node you''re running this function from, from the perspective of the node pointed to by init_from_dsn',
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+ END IF;
+
+ -- Null/empty checks are skipped, the underlying constraints on the table
+ -- will catch that for us.
+ INSERT INTO bdr.bdr_connections (
+ conn_sysid, conn_timeline, conn_dboid,
+ conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+ conn_dsn, conn_apply_delay, conn_replication_sets,
+ conn_is_unidirectional
+ ) VALUES (
+ localid.sysid, localid.timeline, localid.dboid,
+ '0', 0, 0,
+ dsn, apply_delay, replication_sets, false
+ );
+
+ -- Now ensure the per-db worker is started if it's not already running.
+ -- This won't actually take effect until commit time, it just adds a commit
+ -- hook to start the worker when we commit.
+ PERFORM bdr.bdr_connections_changed();
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_group_join(text,text,text,integer,text[])
+IS 'Join an existing BDR group by connecting to a member node and copying its contents';
+
+CREATE FUNCTION bdr.bdr_group_create(
+ dsn text,
+ local_dsn text DEFAULT NULL,
+ apply_delay integer DEFAULT NULL,
+ replication_sets text[] DEFAULT ARRAY['default']
+ )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+BEGIN
+ PERFORM bdr.bdr_group_join(
+ dsn, init_from_dsn := null, local_dsn := local_dsn,
+ apply_delay := apply_delay,
+ replication_sets := replication_sets);
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_group_create(text,text,integer,text[])
+IS 'Create a BDR group, turning a stand-alone database into the first node in a BDR group';
+
+--
+-- The public interface for unidirectional replication setup.
+--
+CREATE FUNCTION bdr.bdr_subscribe(
+ remote_dsn text,
+ local_dsn text,
+ apply_delay integer DEFAULT NULL,
+ replication_sets text[] DEFAULT ARRAY['default']
+ )
+RETURNS void LANGUAGE plpgsql VOLATILE
+SET search_path = bdr, pg_catalog
+SET bdr.permit_unsafe_ddl_commands = on
+SET bdr.skip_ddl_replication = on
+SET bdr.skip_ddl_locking = on
+AS $body$
+DECLARE
+ localid record;
+ remoteid record;
+BEGIN
+ IF local_dsn IS NULL THEN
+ RAISE USING
+ MESSAGE = 'local_dsn may not be null',
+ ERRCODE = 'invalid_parameter_value';
+ END IF;
+
+ IF remote_dsn IS NULL THEN
+ RAISE USING
+      MESSAGE = 'remote_dsn may not be null',
+ ERRCODE = 'invalid_parameter_value';
+ END IF;
+
+ SELECT remote_sysid AS sysid, remote_timeline AS timeline,
+ remote_dboid AS dboid INTO remoteid
+ FROM bdr.internal_begin_join('bdr_subscribe', local_dsn, remote_dsn);
+
+ SELECT sysid, timeline, dboid INTO localid
+ FROM bdr.bdr_get_local_nodeid();
+
+ PERFORM 1 FROM bdr_connections
+ WHERE conn_sysid = remoteid.sysid
+ AND conn_timeline = remoteid.timeline
+ AND conn_dboid = remoteid.dboid
+ AND conn_origin_sysid = localid.sysid
+ AND conn_origin_timeline = localid.timeline
+ AND conn_origin_dboid = localid.dboid
+ AND conn_is_unidirectional = 't';
+
+ IF FOUND THEN
+ RAISE USING
+      MESSAGE = 'This node is already connected to the given remote node',
+ ERRCODE = 'object_not_in_prerequisite_state';
+ END IF;
+
+ -- Null/empty checks are skipped, the underlying constraints on the table
+ -- will catch that for us.
+ INSERT INTO bdr.bdr_connections (
+ conn_sysid, conn_timeline, conn_dboid,
+ conn_origin_sysid, conn_origin_timeline, conn_origin_dboid,
+ conn_dsn, conn_apply_delay, conn_replication_sets,
+ conn_is_unidirectional
+ ) VALUES (
+ remoteid.sysid, remoteid.timeline, remoteid.dboid,
+ localid.sysid, localid.timeline, localid.dboid,
+ remote_dsn, apply_delay, replication_sets, true
+ );
+
+ -- Now ensure the per-db worker is started if it's not already running.
+ -- This won't actually take effect until commit time, it just adds a commit
+ -- hook to start the worker when we commit.
+ PERFORM bdr.bdr_connections_changed();
+END;
+$body$;
+
+COMMENT ON FUNCTION bdr.bdr_subscribe(text,text,integer,text[])
+IS 'Subscribe to remote logical changes';
+
+CREATE FUNCTION bdr.bdr_node_join_wait_for_ready()
+RETURNS void LANGUAGE plpgsql VOLATILE AS $body$
+DECLARE
+ _node_status "char";
+BEGIN
+ IF current_setting('transaction_isolation') <> 'read committed' THEN
+ RAISE EXCEPTION 'Can only wait for node join in an ISOLATION LEVEL READ COMMITTED transaction, not %',
+ current_setting('transaction_isolation');
+ END IF;
+
+ LOOP
+ SELECT INTO _node_status
+ node_status
+ FROM bdr.bdr_nodes
+ WHERE (node_sysid, node_timeline, node_dboid)
+ = bdr.bdr_get_local_nodeid();
+
+ PERFORM pg_sleep(0.5);
+
+ EXIT WHEN _node_status = 'r';
+ END LOOP;
+END;
+$body$;
+
+CREATE FUNCTION bdr_upgrade_to_090(my_conninfo cstring, local_conninfo cstring, remote_conninfo cstring)
+RETURNS void LANGUAGE c AS 'MODULE_PATHNAME';
+
+REVOKE ALL ON FUNCTION bdr_upgrade_to_090(cstring,cstring,cstring) FROM public;
+
+COMMENT ON FUNCTION bdr_upgrade_to_090(cstring,cstring,cstring)
+IS 'Upgrade a BDR 0.7.x or 0.8.x node to BDR 0.9.0 dynamic configuration. remote_conninfo is the node to connect to in order to perform the upgrade, my_conninfo is the dsn other nodes should use to connect to this node, and local_conninfo is used to connect locally back to the node. Use a null remote_conninfo on the first node.';
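+
+-- A minimal usage sketch (commented out, not executed here; the database
+-- names are placeholders). The first node is upgraded standalone with a null
+-- remote conninfo, and each later node points its remote conninfo at an
+-- already-upgraded node:
+--
+--   SELECT bdr.bdr_upgrade_to_090('dbname=nodea', NULL, NULL);
+--   SELECT bdr.bdr_upgrade_to_090('dbname=nodeb', 'dbname=nodeb', 'dbname=nodea');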
+
+RESET bdr.permit_unsafe_ddl_commands;
+RESET bdr.skip_ddl_replication;
+RESET search_path;
errlog()
{
- echo "$@" 1>&2
+ echo "$@" 1>&2
}
JOBS=1
PGRESTORE=
while (($i < ${#argv[*]})); do
- case "${argv[$i]}" in
+ case "${argv[$i]}" in
-V)
- echo "bdr_initial_load (PostgreSQL PG_VERSION, BDR BDR_VERSION)"
- exit
+ echo "bdr_initial_load (PostgreSQL PG_VERSION, BDR BDR_VERSION)"
+ exit
;;
- --snapshot)
- ((i++)); SNAPSHOT="${argv[$i]}"
- ;;
- --source)
- ((i++)); SOURCE="${argv[$i]}"
- ;;
- --target)
- ((i++)); TARGET="${argv[$i]}"
- ;;
- --tmp-directory)
- ((i++)); TMPDIR="${argv[$i]}"
- ;;
- --jobs)
- ((i++)); JOBS="${argv[$i]}"
+ --snapshot)
+ ((i++)); SNAPSHOT="${argv[$i]}"
;;
- --pg-dump-path)
- ((i++)); PGDUMP="${argv[$i]}"
- ;;
- --pg-restore-path)
- ((i++)); PGRESTORE="${argv[$i]}"
- ;;
- --help)
- errlog "Usage: bdr_replica --source <dsn> --target <dsn> [--snapshot <name>] --dir /path/to/dir [--jobs N]"
- errlog "<dsn> is a libpq conninfo string, e.g. \"host=/tmp post=5433 dbname=xxx\""
- exit 0
- ;;
- *)
- errlog Unknown command-line option: ${argv[$i]}
- exit 1
- ;;
- esac
+ --source)
+ ((i++)); SOURCE="${argv[$i]}"
+ ;;
+ --target)
+ ((i++)); TARGET="${argv[$i]}"
+ ;;
+ --tmp-directory)
+ ((i++)); TMPDIR="${argv[$i]}"
+ ;;
+ --jobs)
+ ((i++)); JOBS="${argv[$i]}"
+ ;;
+ --pg-dump-path)
+ ((i++)); PGDUMP="${argv[$i]}"
+ ;;
+ --pg-restore-path)
+ ((i++)); PGRESTORE="${argv[$i]}"
+ ;;
+ --help)
+		errlog "Usage: bdr_initial_load --source <dsn> --target <dsn> [--snapshot <name>] --tmp-directory /path/to/dir [--jobs N] --pg-dump-path <path> --pg-restore-path <path>"
+		errlog "<dsn> is a libpq conninfo string, e.g. \"host=/tmp port=5433 dbname=xxx\""
+ exit 0
+ ;;
+ *)
+ errlog Unknown command-line option: ${argv[$i]}
+ exit 1
+ ;;
+ esac
- ((i++))
+ ((i++))
done
if [ -z "$SOURCE" ]; then
- errlog Please specify a source DSN with '--source "port=nnn dbname=xxx"'; exit 1
+ errlog Please specify a source DSN with '--source "port=nnn dbname=xxx"'; exit 1
fi
if [ -z "$TARGET" ]; then
- errlog Please specify a target DSN with '--target "port=nnn dbname=xxx"'; exit 1
+ errlog Please specify a target DSN with '--target "port=nnn dbname=xxx"'; exit 1
fi
if [ -z "$TMPDIR" ]; then
- errlog Please specify a directory with '--temp-directory /path/to/dir'; exit 1
+	errlog Please specify a directory with '--tmp-directory /path/to/dir'; exit 1
fi
if [ -z "$PGDUMP" ]; then
- errlog The path to pg_dump must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
+ errlog The path to pg_dump must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
fi
if [ -z "$PGRESTORE" ]; then
- errlog The path to pg_restore must be specified with '--pg-dump-path ./path/pg_dump'; exit 1
+	errlog The path to pg_restore must be specified with '--pg-restore-path ./path/pg_restore'; exit 1
fi
SNAP=${SNAPSHOT:+"--snapshot $SNAPSHOT"}
errlog "Dumping remote database \"$SOURCE\" with $JOBS concurrent workers to \"$TMPDIR\""
-if ! "$PGDUMP" -j $JOBS $SNAP -F d -f $TMPDIR "$SOURCE"; then
- errlog "bdr_dump of "$SOURCE" failed, aborting"
- exit 1
+if ! "$PGDUMP" -T "bdr.bdr_nodes" -T "bdr.bdr_connections" -j $JOBS $SNAP -F d -f $TMPDIR "$SOURCE"; then
+ errlog "bdr_dump of "$SOURCE" failed, aborting"
+ exit 1
fi
errlog "Restoring dump to local DB \"$TARGET\" with $JOBS concurrent workers from \"$TMPDIR\""
-if ! "$PGRESTORE" --exit-on-error --single-transaction -j $JOBS -F d -d "$TARGET" $TMPDIR; then
- errlog "pg_restore to "$TARGET" failed, aborting"
- exit 2
+if ! "$PGRESTORE" --exit-on-error -j $JOBS -F d -d "$TARGET" $TMPDIR; then
+ errlog "pg_restore to "$TARGET" failed, aborting"
+ exit 2
fi
exit 0
--- /dev/null
+conninfo "node1" "dbname=node1"
+conninfo "node2" "dbname=node2"
+conninfo "node3" "dbname=node3"
+
+session "snode1"
+
+# pg_xlog_wait_remote_apply isn't good enough alone as it doesn't permit us to
+# say how many nodes must be present. It'll succeed if there are zero nodes.
+# So we first have to wait for enough replication connections.
+#
+# The reason why we call pg_stat_clear_snapshot() is that pg_stat_activity is
+# cached when first accessed so repeat access within the same transaction sees
+# unchanging results. As pg_stat_replication joins pg_stat_get_wal_senders() on
+# pg_stat_activity, new walsenders are filtered out by the join unless we force
+# a refresh of pg_stat_activity.
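+#
+# A minimal sketch of the polling pattern used by the "wait" step below
+# (inside a plpgsql loop):
+#
+#   PERFORM pg_stat_clear_snapshot();   -- discard the cached stats snapshot
+#   nodecount := (SELECT count(*)
+#                 FROM pg_catalog.pg_stat_replication);  -- now sees new walsenders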
+
+connection "node1"
+
+step "setup1"
+{
+ CREATE EXTENSION btree_gist;
+ CREATE EXTENSION bdr;
+}
+
+
+step "join_root"
+{
+ SELECT bdr.bdr_group_create(
+ dsn := 'dbname=node1'
+ );
+}
+
+step "wait"
+{
+ -- pg_xlog_wait_remote_apply isn't good enough alone
+ -- as it doesn't permit us to say how many nodes must be present.
+ -- It'll succeed if there are zero nodes. So we first have to wait
+ -- for enough replication connections.
+ DO $$
+ DECLARE
+ nodecount integer := 0;
+ target_lsn pg_lsn;
+ BEGIN
+ WHILE nodecount <> 6
+ LOOP
+ PERFORM pg_sleep(1);
+ PERFORM pg_stat_clear_snapshot();
+ -- Now find out how many walsenders are running
+ nodecount := (SELECT count(*)
+ FROM pg_catalog.pg_stat_replication);
+ RAISE NOTICE 'Found % nodes',nodecount;
+ END LOOP;
+ -- OK, all nodes seen, now we wait for catchup on them all.
+ target_lsn := pg_current_xlog_location();
+ RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
+ PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
+ RAISE NOTICE 'Catchup to LSN completed';
+ END;
+ $$;
+}
+
+session "snode2"
+connection "node2"
+
+step "setup2"
+{
+ CREATE EXTENSION btree_gist;
+ CREATE EXTENSION bdr;
+}
+
+
+step "join_2"
+{
+ SELECT bdr.bdr_group_join(
+ dsn := 'dbname=node2',
+ init_from_dsn := 'dbname=node1'
+ );
+}
+
+step "wait_join_2"
+{
+ SELECT bdr.bdr_node_join_wait_for_ready();
+}
+
+step "check_join_2"
+{
+ SELECT pg_stat_clear_snapshot();
+ SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+ SELECT count(*) FROM pg_stat_replication;
+ SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+ SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+}
+
+session "snode3"
+connection "node3"
+
+step "setup3"
+{
+ CREATE EXTENSION btree_gist;
+ CREATE EXTENSION bdr;
+}
+
+
+step "join_3"
+{
+ SELECT bdr.bdr_group_join(
+ dsn := 'dbname=node3',
+ init_from_dsn := 'dbname=node1',
+ local_dsn := 'dbname=node3'
+ );
+}
+
+step "wait_join_3"
+{
+ SELECT bdr.bdr_node_join_wait_for_ready();
+}
+
+step "check_join_3"
+{
+ SELECT pg_stat_clear_snapshot();
+ SELECT plugin, slot_type, database, active FROM pg_replication_slots ORDER BY plugin, slot_type, database;
+ SELECT count(*) FROM pg_stat_replication;
+ SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+ SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+}
+
+permutation "setup1" "setup2" "setup3" "join_root" "join_2" "wait_join_2" "check_join_2" "join_3" "wait_join_3" "check_join_3" "wait"
+++ /dev/null
-conninfo "node1" "dbname=node1"
-conninfo "node2" "dbname=node2"
-conninfo "node3" "dbname=node3"
-
-session "snode1"
-
-# pg_xlog_wait_remote_apply isn't good enough alone as it doesn't permit us to
-# say how many nodes must be present. It'll succeed if there are zero nodes.
-# So we first have to wait for enough replication connections.
-#
-# The reason why we call pg_stat_clear_snapshot() is that pg_stat_activity is
-# cached when first accessed so repeat access within the same transaction sees
-# unchanging results. As pg_stat_replication joins pg_stat_get_wal_senders() on
-# pg_stat_activity, new walsenders are filtered out by the join unles we force
-# a refresh of pg_stat_activity.
-
-step "wait"
-{
- DO $$
- DECLARE
- nodecount integer := 0;
- target_lsn pg_lsn;
- BEGIN
- WHILE nodecount <> 6
- LOOP
- PERFORM pg_sleep(1);
- PERFORM pg_stat_clear_snapshot();
- -- Now find out how many walsenders are running
- nodecount := (SELECT count(*)
- FROM pg_catalog.pg_stat_replication);
- RAISE NOTICE 'Found % nodes',nodecount;
- END LOOP;
- -- OK, all nodes seen, now we wait for catchup on them all.
- target_lsn := pg_current_xlog_location();
- RAISE NOTICE 'Found expected % nodes, waiting for xlog catchup to %', 6, target_lsn;
- PERFORM pg_xlog_wait_remote_apply( target_lsn, 0 );
- RAISE NOTICE 'Catchup to LSN completed';
- END;
- $$;
-}
-
-permutation "wait"
CREATE TABLE tbl_without_oids() WITHOUT oids;
DROP TABLE tbl_without_oids;
SET default_with_oids = false;
+SELECT pg_xlog_wait_remote_apply(pg_current_xlog_location(), pid) FROM pg_stat_replication;
--- AGGREGATE ---
\c postgres
\c regression
GRANT ALL ON SCHEMA public TO nonsuper;
-SELECT pg_sleep(10);
-
--- emulate the pg_xlog_wait_remote_apply on vanilla postgres
-DO $DO$BEGIN
- PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
- IF FOUND THEN
- RETURN;
- END IF;
-
- PERFORM bdr.bdr_replicate_ddl_command($DDL$
- CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
- AS $FUNC$
- BEGIN
- WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
- PERFORM pg_sleep(0.01);
- END LOOP;
- END;$FUNC$ LANGUAGE plpgsql;
- $DDL$);
-END;$DO$;
+\c postgres
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
-SELECT bdr.bdr_replicate_ddl_command($DDL$
-CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
- OUT readdb1 text,
- OUT readdb2 text,
- OUT writedb1 text,
- OUT writedb2 text
- ) RETURNS record LANGUAGE SQL AS $f$
-SELECT
- current_setting('bdrtest.readdb1'),
- current_setting('bdrtest.readdb2'),
- current_setting('bdrtest.writedb1'),
- current_setting('bdrtest.writedb2')
-$f$;
-$DDL$);
+\c regression
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr;
--- /dev/null
+\c postgres
+SELECT bdr.bdr_group_create(
+ dsn := 'dbname=postgres',
+ replication_sets := ARRAY['default', 'important', 'for-node-1']
+ );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+\c regression
+SELECT bdr.bdr_group_join(
+ dsn := 'dbname=regression',
+ init_from_dsn := 'dbname=postgres',
+ local_dsn := 'dbname=regression',
+ replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+ );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+-- Make sure we see two slots and two active connections
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT count(*) FROM pg_stat_replication;
+
+\c postgres
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections ORDER BY conn_dsn;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes ORDER BY node_local_dsn;
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+ OUT readdb1 text,
+ OUT readdb2 text,
+ OUT writedb1 text,
+ OUT writedb2 text
+ ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+ current_setting('bdrtest.readdb1'),
+ current_setting('bdrtest.readdb2'),
+ current_setting('bdrtest.writedb1'),
+ current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
--- /dev/null
+\c postgres
+SELECT bdr.bdr_subscribe(
+ remote_dsn := 'dbname=regression',
+ local_dsn := 'dbname=postgres',
+ replication_sets := ARRAY['default', 'important', 'for-node-2', 'for-node-2-insert', 'for-node-2-update', 'for-node-2-delete']
+ );
+
+SELECT bdr.bdr_node_join_wait_for_ready();
+
+-- Make sure we see the slot and active connection
+SELECT plugin, slot_type, database, active FROM pg_replication_slots;
+SELECT count(*) FROM pg_stat_replication;
+
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+
+\c regression
+SELECT conn_dsn, conn_replication_sets FROM bdr.bdr_connections;
+SELECT node_status, node_local_dsn, node_init_from_dsn FROM bdr.bdr_nodes;
+
+-- emulate the pg_xlog_wait_remote_apply on vanilla postgres
+DO $DO$BEGIN
+ PERFORM 1 FROM pg_proc WHERE proname = 'pg_xlog_wait_remote_apply';
+ IF FOUND THEN
+ RETURN;
+ END IF;
+
+ PERFORM bdr.bdr_replicate_ddl_command($DDL$
+ CREATE OR REPLACE FUNCTION public.pg_xlog_wait_remote_apply(i_pos pg_lsn, i_pid integer) RETURNS VOID
+ AS $FUNC$
+ BEGIN
+ WHILE EXISTS(SELECT true FROM pg_stat_get_wal_senders() s WHERE s.flush_location < i_pos AND (i_pid = 0 OR s.pid = i_pid)) LOOP
+ PERFORM pg_sleep(0.01);
+ END LOOP;
+ END;$FUNC$ LANGUAGE plpgsql;
+ $DDL$);
+END;$DO$;
+
+SELECT bdr.bdr_replicate_ddl_command($DDL$
+CREATE OR REPLACE FUNCTION public.bdr_regress_variables(
+ OUT readdb1 text,
+ OUT readdb2 text,
+ OUT writedb1 text,
+ OUT writedb2 text
+ ) RETURNS record LANGUAGE SQL AS $f$
+SELECT
+ current_setting('bdrtest.readdb1'),
+ current_setting('bdrtest.readdb2'),
+ current_setting('bdrtest.writedb1'),
+ current_setting('bdrtest.writedb2')
+$f$;
+$DDL$);
CREATE EXTENSION bdr VERSION '0.9.0.0';
DROP EXTENSION bdr;
+CREATE EXTENSION bdr VERSION '0.9.0.1';
+DROP EXTENSION bdr;
+
-- evolve version one by one from the oldest to the newest one
CREATE EXTENSION bdr VERSION '0.8.0';
ALTER EXTENSION bdr UPDATE TO '0.8.0.1';
ALTER EXTENSION bdr UPDATE TO '0.8.0.6';
ALTER EXTENSION bdr UPDATE TO '0.8.0.7';
ALTER EXTENSION bdr UPDATE TO '0.9.0.0';
+ALTER EXTENSION bdr UPDATE TO '0.9.0.1';
-- Should never have to do anything: You missed adding the new version above.
--- /dev/null
+--
+-- Attempt to simulate an upgrade from BDR 0.8.0 to the current
+-- version.
+--
+-- 0.8.0 used GUCs for bdr.connections DSN configuration, etc. We can manually
+-- create the slots, replication identifiers, and bdr.bdr_nodes entries as if
+-- this was a 0.8.0 DB just about to be upgraded, then upgrade the extension
+-- and execute the upgrade process.
+--
+
+
+CREATE DATABASE upgrade_sim_0800_a;
+CREATE DATABASE upgrade_sim_0800_b;
+
+\c upgrade_sim_0800_a;
+------------------------------------------
+-- Prepare node upgrade_sim_0800_a --
+------------------------------------------
+
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr VERSION '0.8.0';
+
+-- public.bdr_get_local_nodeid() is defined in the bdr extension's C library
+-- but isn't exposed by 0.8.0's SQL extension script. We have to use it to
+-- create the required slots etc., so create it in public temporarily.
+
+CREATE FUNCTION public.bdr_get_local_nodeid( sysid OUT text, timeline OUT oid, dboid OUT oid)
+RETURNS record LANGUAGE c AS 'bdr';
+
+CREATE TABLE dummytable(
+ id integer primary key,
+ somevalue text
+);
+
+INSERT INTO dummytable(id, somevalue) VALUES (1, '42'), (2, 'fred');
+
+SELECT pg_replication_identifier_create(
+ format('bdr_%s_%s_%s_%s__%s',
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+ sysid, timeline,
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+ ''
+ )
+)
+FROM public.bdr_get_local_nodeid();
+
+INSERT INTO bdr.bdr_nodes
+(node_sysid, node_timeline, node_dboid, node_status)
+SELECT
+ sysid, timeline, (SELECT oid FROM pg_database WHERE datname = dn), 'r'
+FROM (VALUES ('upgrade_sim_0800_a'), ('upgrade_sim_0800_b')) x(dn),
+ public.bdr_get_local_nodeid();
+
+SELECT pg_create_logical_replication_slot(
+ format('bdr_%s_%s_%s_%s__%s',
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+ sysid, timeline,
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+ ''
+ ),
+ 'bdr')
+FROM public.bdr_get_local_nodeid();
+
+DROP FUNCTION public.bdr_get_local_nodeid();
+
+
+
+
+
+
+\c upgrade_sim_0800_b;
+------------------------------------------
+-- Prepare node upgrade_sim_0800_b --
+------------------------------------------
+
+CREATE EXTENSION btree_gist;
+CREATE EXTENSION bdr VERSION '0.8.0';
+
+CREATE FUNCTION public.bdr_get_local_nodeid( sysid OUT text, timeline OUT oid, dboid OUT oid)
+RETURNS record LANGUAGE c AS 'bdr';
+
+CREATE TABLE dummytable(
+ id integer primary key,
+ somevalue text
+);
+
+INSERT INTO dummytable(id, somevalue) VALUES (1, '42'), (2, 'fred');
+
+SELECT pg_replication_identifier_create(
+ format('bdr_%s_%s_%s_%s__%s',
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+ sysid, timeline,
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+ ''
+ )
+)
+FROM public.bdr_get_local_nodeid();
+
+INSERT INTO bdr.bdr_nodes
+(node_sysid, node_timeline, node_dboid, node_status)
+SELECT
+ sysid, timeline, (SELECT oid FROM pg_database WHERE datname = dn), 'r'
+FROM (VALUES ('upgrade_sim_0800_a'), ('upgrade_sim_0800_b')) x(dn),
+ public.bdr_get_local_nodeid();
+
+SELECT pg_create_logical_replication_slot(
+ format('bdr_%s_%s_%s_%s__%s',
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_a'),
+ sysid, timeline,
+ (SELECT oid FROM pg_database WHERE datname = 'upgrade_sim_0800_b'),
+ ''
+ ),
+ 'bdr')
+FROM public.bdr_get_local_nodeid();
+
+DROP FUNCTION public.bdr_get_local_nodeid();
+
+
+
+
+
+------------------------------------------
+-- Test the upgrade --
+------------------------------------------
+--
+-- We now have two databases that look like they were running BDR, with
+-- contents in sync at the time of upgrade. The origin replication identifier
+-- information is wrong as both have InvalidRepNodeId but we don't really care
+-- about that. It's as if we deleted bdr.bdr_connections then started the DB
+-- up.
+--
+-- Time to upgrade to dynconf. Hope this works!
+--
+
+-- First the extension must be updated on BOTH nodes
+\c upgrade_sim_0800_a
+ALTER EXTENSION bdr UPDATE;
+\c upgrade_sim_0800_b
+ALTER EXTENSION bdr UPDATE;
+
+
+-- then one must be upgraded standalone. For this one we'll provide no local
+-- dsn; it must be inferred from the node dsn in that case. There's also no
+-- remote DSN since it's the first node.
+\c upgrade_sim_0800_a
+SELECT bdr.bdr_upgrade_to_090('dbname=upgrade_sim_0800_a', NULL, NULL);
+
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+ORDER BY datname;
+
+SELECT * FROM pg_catalog.pg_shseclabel
+WHERE classoid = (SELECT oid FROM pg_class WHERE relname = 'pg_database')
+ AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+
+-- Upgrade the second node using the first node. This time we'll
+-- supply a local dsn too, though it'll be the same.
+\c upgrade_sim_0800_b
+
+-- must have old nodes, no replication can have occurred
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+ORDER BY datname;
+
+SELECT bdr.bdr_upgrade_to_090('dbname=upgrade_sim_0800_b', 'dbname=upgrade_sim_0800_b', 'dbname=upgrade_sim_0800_a');
+
+-- The local node's row must be updated. The remote node's row could be in
+-- either state, as replication might or might not have delivered it yet.
+SELECT node_timeline, datname, node_status, node_local_dsn, node_init_from_dsn
+FROM bdr.bdr_nodes n INNER JOIN pg_database d ON (n.node_dboid = d.oid)
+WHERE datname = current_database()
+ORDER BY datname;
+
+SELECT * FROM pg_catalog.pg_shseclabel
+WHERE classoid = (SELECT oid FROM pg_class WHERE relname = 'pg_database')
+ AND objoid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+-- TODO: wait for remote apply, switch back
+
+-- TODO: use test table
+
+-- TODO: lots of failure cases