add in a check/test to ensure that the backup node in a failover is a forwarding...
author Steve Singer <ssinger@ca.afilias.info>
Thu, 25 Jul 2013 19:37:17 +0000 (15:37 -0400)
committer Steve Singer <ssinger@ca.afilias.info>
Fri, 26 Jul 2013 14:22:42 +0000 (10:22 -0400)
If the backup node is not a forwarding subscriber to all of the sets from the origin
then the failover can't complete.  It is best to check this early on and warn the user
before the failover actually starts.

clustertest/disorder/tests/Failover.js
src/slonik/slonik.c

index cbef966c59c5435cba0cf50f4d484a825f60582b..2dd0ea1e39e89f6cb676220d70214bf5b9f6459a 100644 (file)
@@ -40,9 +40,24 @@ Failover.prototype.runTest = function() {
         * Subscribe the first node.
         */
        this.subscribeSet(1,1, 1, [ 2, 3 ]);
+
+
+       /**
+        * try failing over node 1 to node 4 first.
+        * node 4 is NOT subscribed to the sets of node
+        * 1 so this had better fail.
+        */
+       this.slonikSync(1,1);
+       this.failNode(1,4,false);
+       this.slonArray[1 - 1] = this.coordinator.createSlonLauncher('db' + 1);
+       this.slonArray[1 - 1].run();
+       
+       
        this.subscribeSet(1,1, 3, [ 4, 5 ]);
        this.slonikSync(1,1);
 
+       
+
        var load = this.generateLoad();
        
        
@@ -55,7 +70,7 @@ Failover.prototype.runTest = function() {
   
        var lag1 = this.measureLag(1,5);
        java.lang.Thread.sleep(10*1000);
-       var lag2 = this.measureLag(1,5);        
+       var lag2 = this.measureLag(1,5);
        this.testResults.assertCheck('lag on node 5 is increasing',lag2 > lag1 ,true);
                
        
@@ -97,6 +112,18 @@ Failover.prototype.runTest = function() {
        this.addCompletePaths();
        
        this.moveSet(1,3,1);
+       /**
+        * make sure we perform a SYNC after the move set
+        * but before we start failing nodes.  The listen network
+        * is different after the move set and we want to make sure
+        * all nodes have the new listen network.
+        * Node 5 might have a 1,SYNC event more recent
+        * than node 3 because prior to the MOVE SET
+        * the listen network allowed for this.
+        * The FAILOVER logic can't deal with 5 being
+        * the most-ahead node since it isn't a direct subscriber.
+        */
+       this.slonikSync(1,1);
        load = this.generateLoad();
        
        
@@ -157,7 +184,7 @@ Failover.prototype.runTest = function() {
        this.compareDb('db1', 'db4');
        this.addCompletePaths();
        this.moveSet(1,3,1)
-
+       this.slonikSync(1,1);
        /**
         * Now shutdown the slon for node 3, see how a failover to node 3 behaves.
         */
@@ -207,6 +234,7 @@ Failover.prototype.runTest = function() {
        
        this.addCompletePaths();        
        this.moveSet(1,3,1);
+       this.slonikSync(1,1);
        load = this.generateLoad();
        this.coordinator.log('stopping load');
        java.lang.Thread.sleep(30*1000);
@@ -327,7 +355,6 @@ Failover.prototype.failNode=function(node_id,backup_id, expect_success) {
        this.coordinator.join(slonik);  
        this.testResults.assertCheck('slonik failover status okay',slonik.getReturnCode()==0,expect_success);
        
-       
        this.coordinator.log('subscribe list is now');
        rs = stat.executeQuery("SELECT * FROM _disorder_replica.sl_set");
        while(rs.next()) {
index d2b47ba02e1a92beefd3e187af46ce084ae1e7ca..429a8de4c05bab7f9aee1db01582e7bcf7afafe3 100644 (file)
@@ -2846,7 +2846,7 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
        /**
         * validate that the list of failed nodes is complete.
         * This means that for any providers in sl_subscribe 
-        * that have failed there muts be a path between
+        * that have failed there must be a path between
         * the receiver node and the specified backup node.
         *
         * If this isn't the case then the user must also include
@@ -2897,6 +2897,39 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
 
                }
                PQclear(res1);
+               
+               /**
+                * make sure that the backup_node is a forwarding subscriber of ALL
+                * sets that the failed node is an origin for.
+                */
+               slon_mkquery(&query,"select set.set_id FROM \"_%s\".sl_set set " \
+                                        " left outer join \"_%s\".sl_subscribe sub on (" \
+                                        "    set.set_id=sub.sub_set and sub.sub_receiver=%d " \
+                                        "    and sub.sub_forward=true) " \
+                                        " where sub.sub_set is null and set.set_origin=%d "
+                                        , stmt->hdr.script->clustername
+                                        , stmt->hdr.script->clustername
+                                        , node_entry->backup_node
+                                        , node_entry->no_id);
+               
+               res1 = db_exec_select((SlonikStmt *) stmt, adminfo1, &query);
+               if (res1 == NULL)
+               {
+                       rc = -1;
+                       goto cleanup;
+               }
+               if(PQntuples(res1)!=0) 
+               {
+                       printf("%s:%d node %d is not a forwarding subscriber of set %s " \
+                                  "from node %d \n",
+                                  stmt->hdr.stmt_filename, stmt->hdr.stmt_lno,
+                                  node_entry->backup_node,PQgetvalue(res1,0,0),
+                                  node_entry->no_id);
+                       missing_paths=true;
+               }
+               PQclear(res1);
+
+
        }
        if ( missing_paths )
        {
@@ -2926,6 +2959,9 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
                        rc = -1;
                        goto cleanup;
                }
+
+       
+
                if (db_begin_xact((SlonikStmt *) stmt, adminfo1, false) < 0)
                {
                        printf("%s:%d can not connect to node %d\n",
@@ -2937,6 +2973,10 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
 
                fail_node_ids[cur_origin_idx] = node_entry->no_id;
 
+
+       
+
+       
                /*
                 * On the backup node select a list of all failover candidate nodes
                 * except for the failed nodes.