* Subscribe the first node.
*/
this.subscribeSet(1,1, 1, [ 2, 3 ]);
+
+
+ /**
+ * try failing over node 1 to node 4 first.
+ * node 4 is NOT subscribed to the sets of node
+ * 1 so this had better fail.
+ */
+ this.slonikSync(1,1);
+ this.failNode(1,4,false);
+ this.slonArray[1 - 1] = this.coordinator.createSlonLauncher('db' + 1);
+ this.slonArray[1 - 1].run();
+
+
this.subscribeSet(1,1, 3, [ 4, 5 ]);
this.slonikSync(1,1);
+
+
var load = this.generateLoad();
var lag1 = this.measureLag(1,5);
java.lang.Thread.sleep(10*1000);
- var lag2 = this.measureLag(1,5);
+ var lag2 = this.measureLag(1,5);
this.testResults.assertCheck('lag on node 5 is increasing',lag2 > lag1 ,true);
this.addCompletePaths();
this.moveSet(1,3,1);
+ /**
+ * make sure we perform a SYNC after the move set
+ * but before we start failing nodes. The listen network
+ * is different after the move set and we want to make sure
+ * all nodes have the new listen network.
+ * Node 5 might have a 1,SYNC event more recent
+ * than node 3 because prior to the MOVE SET
+ * the listen network allowed for this.
+ * The FAILOVER logic can't deal with 5 being
+ * the most-ahead node since it isn't a direct subscriber.
+ */
+ this.slonikSync(1,1);
load = this.generateLoad();
this.compareDb('db1', 'db4');
this.addCompletePaths();
this.moveSet(1,3,1)
-
+ this.slonikSync(1,1);
/**
* Now shutdown the slon for node 3, see how a failover to node 3 behaves.
*/
this.addCompletePaths();
this.moveSet(1,3,1);
+ this.slonikSync(1,1);
load = this.generateLoad();
this.coordinator.log('stopping load');
java.lang.Thread.sleep(30*1000);
this.coordinator.join(slonik);
this.testResults.assertCheck('slonik failover status okay',slonik.getReturnCode()==0,expect_success);
-
this.coordinator.log('subscribe list is now');
rs = stat.executeQuery("SELECT * FROM _disorder_replica.sl_set");
while(rs.next()) {
/**
* validate that the list of failed nodes is complete.
* This means that for any providers in sl_subscribe
- * that have failed there muts be a path between
+ * that have failed there must be a path between
* the receiver node and the specified backup node.
*
* If this isn't the case then the user must also include
}
PQclear(res1);
+
+ /**
+ * make sure that the backup_node is a forwarding subscriber of ALL
+ * sets the failed node is an origin for.
+ */
+ slon_mkquery(&query,"select set.set_id FROM \"_%s\".sl_set set " \
+ " left outer join \"_%s\".sl_subscribe sub on (" \
+ " set.set_id=sub.sub_set and sub.sub_receiver=%d " \
+ " and sub.sub_forward=true) " \
+ " where sub.sub_set is null and set.set_origin=%d "
+ , stmt->hdr.script->clustername
+ , stmt->hdr.script->clustername
+ , node_entry->backup_node
+ , node_entry->no_id);
+
+ res1 = db_exec_select((SlonikStmt *) stmt, adminfo1, &query);
+ if (res1 == NULL)
+ {
+ rc = -1;
+ goto cleanup;
+ }
+ if(PQntuples(res1)!=0)
+ {
+ printf("%s:%d node %d is not a forwarding subscriber of set %s " \
+ "from node %d \n",
+ stmt->hdr.stmt_filename, stmt->hdr.stmt_lno,
+ node_entry->backup_node,PQgetvalue(res1,0,0),
+ node_entry->no_id);
+ missing_paths=true;
+ }
+ PQclear(res1);
+
+
}
if ( missing_paths )
{
rc = -1;
goto cleanup;
}
+
+
+
if (db_begin_xact((SlonikStmt *) stmt, adminfo1, false) < 0)
{
printf("%s:%d can not connect to node %d\n",
fail_node_ids[cur_origin_idx] = node_entry->no_id;
+
+
+
+
/*
* On the backup node select a list of all failover candidate nodes
* except for the failed nodes.