From: Steve Singer Date: Thu, 25 Jul 2013 19:37:17 +0000 (-0400) Subject: add in a check/test to ensure that the backup node in a failover is a forwarding... X-Git-Tag: REL_2_2_0_B5~11 X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=60f0c17855831f3b57601f2fdb24e814db3aa366;p=slony1-engine.git add in a check/test to ensure that the backup node in a failover is a forwarding subscriber If the backup node is not a forwarding subscriber to all of the sets from the origin then the failover can't complete. It is best to check this early on and warn the user before the failover actually starts. --- diff --git a/clustertest/disorder/tests/Failover.js b/clustertest/disorder/tests/Failover.js index cbef966c..2dd0ea1e 100644 --- a/clustertest/disorder/tests/Failover.js +++ b/clustertest/disorder/tests/Failover.js @@ -40,9 +40,24 @@ Failover.prototype.runTest = function() { * Subscribe the first node. */ this.subscribeSet(1,1, 1, [ 2, 3 ]); + + + /** + * try failing over node 1 to node 4 first. + * node 4 is NOT subscribed to the sets of node + * 1 so this had better fail. + */ + this.slonikSync(1,1); + this.failNode(1,4,false); + this.slonArray[1 - 1] = this.coordinator.createSlonLauncher('db' + 1); + this.slonArray[1 - 1].run(); + + this.subscribeSet(1,1, 3, [ 4, 5 ]); this.slonikSync(1,1); + + var load = this.generateLoad(); @@ -55,7 +70,7 @@ Failover.prototype.runTest = function() { var lag1 = this.measureLag(1,5); java.lang.Thread.sleep(10*1000); - var lag2 = this.measureLag(1,5); + var lag2 = this.measureLag(1,5); this.testResults.assertCheck('lag on node 5 is increasing',lag2 > lag1 ,true); @@ -97,6 +112,18 @@ Failover.prototype.runTest = function() { this.addCompletePaths(); this.moveSet(1,3,1); + /** + * make sure we perform a SYNC after the move set + * but before we start failing nodes. The listen network + * is different after the move set and we want to make sure + * all nodes have the new listen network. 
+ * Node 5 might have a 1,SYNC event more recent + * than node 3 because prior to the MOVE SET + * the listen network allowed for this. + * The FAILOVER logic can't deal with 5 being + * the most-ahead node since it isn't a direct subscriber. + */ + this.slonikSync(1,1); load = this.generateLoad(); @@ -157,7 +184,7 @@ Failover.prototype.runTest = function() { this.compareDb('db1', 'db4'); this.addCompletePaths(); this.moveSet(1,3,1) - + this.slonikSync(1,1); /** * Now shutdown the slon for node 3, see how a failover to node 3 behaves. */ @@ -207,6 +234,7 @@ Failover.prototype.runTest = function() { this.addCompletePaths(); this.moveSet(1,3,1); + this.slonikSync(1,1); load = this.generateLoad(); this.coordinator.log('stopping load'); java.lang.Thread.sleep(30*1000); @@ -327,7 +355,6 @@ Failover.prototype.failNode=function(node_id,backup_id, expect_success) { this.coordinator.join(slonik); this.testResults.assertCheck('slonik failover status okay',slonik.getReturnCode()==0,expect_success); - this.coordinator.log('subscribe list is now'); rs = stat.executeQuery("SELECT * FROM _disorder_replica.sl_set"); while(rs.next()) { diff --git a/src/slonik/slonik.c b/src/slonik/slonik.c index d2b47ba0..429a8de4 100644 --- a/src/slonik/slonik.c +++ b/src/slonik/slonik.c @@ -2846,7 +2846,7 @@ slonik_failed_node(SlonikStmt_failed_node * stmt) /** * validate that the list of failed nodes is complete. * This means that for any providers in sl_subscribe - * that have failed there muts be a path between + * that have failed there must be a path between * the receiver node and the specified backup node. * * If this isn't the case then the user must also include @@ -2897,6 +2897,39 @@ slonik_failed_node(SlonikStmt_failed_node * stmt) } PQclear(res1); + + /** + * make sure that the backup_node is a forwarding subscriber of ALL + * sets that the failed node is an origin for. 
+ */ + slon_mkquery(&query,"select set.set_id FROM \"_%s\".sl_set set " \ + " left outer join \"_%s\".sl_subscribe sub on (" \ + " set.set_id=sub.sub_set and sub.sub_receiver=%d " \ + " and sub.sub_forward=true) " \ + " where sub.sub_set is null and set.set_origin=%d " + , stmt->hdr.script->clustername + , stmt->hdr.script->clustername + , node_entry->backup_node + , node_entry->no_id); + + res1 = db_exec_select((SlonikStmt *) stmt, adminfo1, &query); + if (res1 == NULL) + { + rc = -1; + goto cleanup; + } + if(PQntuples(res1)!=0) + { + printf("%s:%d node %d is not a forwarding subscriber of set %s " \ + "from node %d \n", + stmt->hdr.stmt_filename, stmt->hdr.stmt_lno, + node_entry->backup_node,PQgetvalue(res1,0,0), + node_entry->no_id); + missing_paths=true; + } + PQclear(res1); + + } if ( missing_paths ) { @@ -2926,6 +2959,9 @@ slonik_failed_node(SlonikStmt_failed_node * stmt) rc = -1; goto cleanup; } + + + if (db_begin_xact((SlonikStmt *) stmt, adminfo1, false) < 0) { printf("%s:%d can not connect to node %d\n", @@ -2937,6 +2973,10 @@ slonik_failed_node(SlonikStmt_failed_node * stmt) fail_node_ids[cur_origin_idx] = node_entry->no_id; + + + + /* * On the backup node select a list of all failover candidate nodes * except for the failed nodes.