bug 296 - part 1 FAILOVER fixes
author     Steve Singer <ssinger@ca.afilias.info>
Wed, 26 Jun 2013 15:31:19 +0000 (11:31 -0400)
committer  Steve Singer <ssinger@ca.afilias.info>
Wed, 26 Jun 2013 15:31:19 +0000 (11:31 -0400)
This change allows non-origin nodes to be specified on the FAILOVER command
so that subscriptions for these nodes are not redirected to the backup node.
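
For example (a sketch using the node numbers from the SiteFailover test added
below), the failed non-origin node 4 can now be listed alongside the failed
origins so that its subscriptions are not redirected to a backup it cannot reach:

    FAILOVER( node=(id=1, backup node=2),
              node=(id=3, backup node=5),
              node=(id=4, backup node=2) );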

An array of all the failed nodes (in addition to the one being failed over at this
stage of the process) is now passed to many of the failover stored functions.
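
As a rough illustration (the "disorder_replica" cluster name is simply the one
used by the tests), a call the remote worker previously issued as

    select "_disorder_replica".failedNode(1, 2);

is now issued with the full list of failed nodes:

    select "_disorder_replica".failedNode(1, 2, ARRAY[1,3,4]);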

This commit also adds the SiteFailover unit test, which exercises multiple nodes in a site
failing over when the cluster does not have a complete path network.

clustertest/disorder/tests/SiteFailover.js [new file with mode: 0644]
src/backend/slony1_funcs.sql
src/slon/remote_worker.c
src/slonik/slonik.c

diff --git a/clustertest/disorder/tests/SiteFailover.js b/clustertest/disorder/tests/SiteFailover.js
new file mode 100644
index 0000000..0ca9740
--- /dev/null
+++ b/clustertest/disorder/tests/SiteFailover.js
@@ -0,0 +1,198 @@
+
+coordinator.includeFile('disorder/tests/FailNodeTest.js');
+
+SiteFailover = function(coordinator, testResults) {
+       Failover.call(this, coordinator, testResults);
+       this.testDescription='Test the FAILOVER command.  This test will try FAILOVER'
+               +' with an entire site failing';
+       this.compareQueryList.push(['select i_id,comments from disorder.do_item_review order by i_id','i_id']);
+
+}
+SiteFailover.prototype = new Failover();
+SiteFailover.prototype.constructor = SiteFailover;
+
+SiteFailover.prototype.getNodeCount = function() {
+       return 6;
+}
+
+SiteFailover.prototype.runTest = function() {
+    this.coordinator.log("SiteFailover.prototype.runTest - begin");
+    this.testResults.newGroup("Multinode Fail Over Test");
+    basicTest.prepareDb(['db6']);
+    this.setupReplication();
+    
+    /**
+     * 
+     * set 1
+     *
+     *    1 --------> 2
+     *   /\          /\ 
+     *  3  4        5  6
+     *
+     * set 2 
+     *   3-----------> 5
+     *  /\            /\
+     * 1  4          2  6
+     *
+     * 
+     *  There is a path between 3/2 because 3 is the local failover target for node 1.
+     *  
+     *  There is a path between 1/5 because 1 is the failover target for set 2 if node 3 fails.
+     *
+     *  There is no path between 4/6, 1/6, or 3/6.
+     *  
+     */
+    
+    /**
+     * Start the slons.
+     */
+    this.slonArray = [];
+    for ( var idx = 1; idx <= this.getNodeCount(); idx++) {
+       this.slonArray[idx - 1] = this.coordinator.createSlonLauncher('db' + idx);
+       this.slonArray[idx - 1].run();
+    }
+    var slonikScript='';
+    /**
+     * add in paths between nodes  1-->4 and 2-->5
+     * also add in node 6
+     */
+    slonikScript += 'store path(server=1,client=4,conninfo=@CONNINFO1 );\n';
+    slonikScript += 'store path(server=4,client=1,conninfo=@CONNINFO4 );\n';
+    slonikScript += 'store path(server=2,client=5,conninfo=@CONNINFO2 );\n';
+    slonikScript += 'store path(server=5,client=2,conninfo=@CONNINFO5 );\n';
+
+   
+    slonikScript += 'store path(server=2,client=6,conninfo=@CONNINFO2 );\n';
+    slonikScript += 'store path(server=6,client=2,conninfo=@CONNINFO6 );\n';
+    slonikScript += 'store path(server=5,client=6,conninfo=@CONNINFO5 );\n';
+    slonikScript += 'store path(server=6,client=5,conninfo=@CONNINFO6 );\n';
+    var slonikPreamble = this.getSlonikPreamble();
+    var slonik=this.coordinator.createSlonik('failover',slonikPreamble,slonikScript);
+    slonik.run();
+    this.coordinator.join(slonik);
+    this.testResults.assertCheck('add extra paths passes',slonik.getReturnCode(),0);
+    
+    /**
+     * Add some tables to replication.
+     * 
+     */
+    this.addTables();
+
+
+    /**
+     * create the second set.
+     */
+    this.createSecondSet(3);
+
+    /**
+     * Subscribe the first set.
+     */
+    this.subscribeSet(1,1, 1, [ 2, 3 ,4]);
+    this.subscribeSet(1,1, 2, [ 5,6 ]);
+    /**
+     * subscribe the second set.     
+     */
+    this.subscribeSet(2,3,3,[1,4,5]);
+    this.subscribeSet(2,3,5,[2,6]);
+    this.slonikSync(1,1);
+    var load = this.generateLoad(1);
+    java.lang.Thread.sleep(10*1000);
+    this.slonikSync(1,1);
+    this.populateReviewTable(2);
+    /**
+     * make sure the _review data makes it to
+     * all slaves, then let some SYNC events get
+     * generated.  Next we FAILOVER.
+     */
+    this.slonikSync(2,2);
+    java.lang.Thread.sleep(10*1000);
+    this.failover(1,2,3,5,4);          
+    load.stop();
+    this.coordinator.join(load);
+
+       /**
+        * Now drop the nodes
+        */
+       var slonikPreamble = this.getSlonikPreamble();
+       var slonikScript = 'echo \'SiteFailover.drop nodes\';\n';
+       slonikScript+= 'drop node(id=\'1,3,4\',event node = 2);\nuninstall node(id=1);\nuninstall node(id=3);\nuninstall node(id=4);\n';
+
+       var slonik=this.coordinator.createSlonik('drop node',slonikPreamble,slonikScript);
+       slonik.run();
+       this.coordinator.join(slonik);
+       this.testResults.assertCheck('drop 3 nodes passes',slonik.getReturnCode(),0);   
+    
+       load = this.generateLoad(2);
+    java.lang.Thread.sleep(10*1000);
+    this.slonikSync(1,2);
+    this.slonikSync(2,5);
+    
+
+    for ( var idx = 1; idx <= this.getNodeCount(); idx++) {
+       this.slonArray[idx - 1].stop();
+       this.coordinator.join(this.slonArray[idx - 1]);
+    }          
+    this.compareDb('db2','db5');
+    this.compareDb('db2','db6');
+    this.dropDb(['db6']);
+    
+}
+
+SiteFailover.prototype.failover=function(originA,backupA,originB,backupB,receiverC) 
+{
+    var slonikPreamble = 'cluster name=disorder_replica;\n';
+    var idx=2
+    slonikPreamble += 'node ' + idx + ' admin conninfo=\'dbname=$database.db'
+       + idx + '.dbname host=$database.db' + idx
+       + '.host  '
+       + ' port=$database.db' + idx + '.port'
+       +' user=$database.db' + idx                             
+       + '.user.slony password=$database.db' + idx + '.password.slony\';\n';
+    
+    var idx=5
+    slonikPreamble += 'node ' + idx + ' admin conninfo=\'dbname=$database.db'
+       + idx + '.dbname host=$database.db' + idx
+       + '.host  '
+       + ' port=$database.db' + idx + '.port'
+       +' user=$database.db' + idx                             
+       + '.user.slony password=$database.db' + idx + '.password.slony\';\n';
+    
+    var idx=6
+    slonikPreamble += 'node ' + idx + ' admin conninfo=\'dbname=$database.db'
+       + idx + '.dbname host=$database.db' + idx
+       + '.host  '
+       + ' port=$database.db' + idx + '.port'
+       +' user=$database.db' + idx                             
+       + '.user.slony password=$database.db' + idx + '.password.slony\';\n';
+    
+
+       /**
+        * first try failing over when we don't mention node 4.
+        * This should fail.
+        */
+    var slonikScript = 'echo \'SiteFailover.prototype.failover\';\n';
+       slonikScript += 'FAILOVER( node=(id='  + originA  + ',backup node=' + backupA +')'
+       + ', node=(id=' + originB + ',backup node=' + backupB + '));\n';
+       var slonik=this.coordinator.createSlonik('failover',slonikPreamble,slonikScript);
+       slonik.run();
+       this.coordinator.join(slonik);
+       this.testResults.assertCheck('failover (should not work)',slonik.getReturnCode()==0,false);     
+       
+
+       /**
+        * try again, everything should work
+        */ 
+
+       slonikScript = 'echo \'SiteFailover.prototype.failover\';\n';
+       slonikScript += 'FAILOVER( node=(id='  + originA  + ',backup node=' + backupA +')'
+       + ', node=(id=' + originB + ',backup node=' + backupB 
+               + '), node=(id='+receiverC + ', backup node=' + backupA + ') );\n';
+       var slonik=this.coordinator.createSlonik('failover',slonikPreamble,slonikScript);
+       slonik.run();
+       this.coordinator.join(slonik);
+       this.testResults.assertCheck('failover should work',slonik.getReturnCode(),0);
+
+       
+}
+
+
diff --git a/src/backend/slony1_funcs.sql b/src/backend/slony1_funcs.sql
index a89f1ab91033bde0e3149380c4c9bbe03d483c77..d495d9f7c6b72378500a0293dc8f51e37e3f1dca 100644
--- a/src/backend/slony1_funcs.sql
+++ b/src/backend/slony1_funcs.sql
@@ -1191,7 +1191,7 @@ and then restart of all node daemons.';
 --     Initiate a failover. This function must be called on all nodes
 --     and then waited for the restart of all node daemons.
 -- ----------------------------------------------------------------------
-create or replace function @NAMESPACE@.failedNode(p_failed_node int4, p_backup_node int4)
+create or replace function @NAMESPACE@.failedNode(p_failed_node int4, p_backup_node int4,p_failed_nodes integer[])
 returns int4
 as $$
 declare
@@ -1207,7 +1207,8 @@ begin
        update @NAMESPACE@.sl_subscribe set 
                   sub_provider=p_backup_node
                   where sub_provider=p_failed_node
-                  and sub_receiver<>p_backup_node;
+                  and sub_receiver<>p_backup_node
+                  and sub_receiver <> ALL (p_failed_nodes);
 
        -- ----
        -- Terminate all connections of the failed node the hard way
@@ -1226,7 +1227,7 @@ begin
 
         if not v_failed then
                
-               update @NAMESPACE@.sl_node set no_failed=true where no_id=p_failed_node
+               update @NAMESPACE@.sl_node set no_failed=true where no_id = ANY (p_failed_nodes)
                and no_failed=false;
           -- Rewrite sl_listen table
           perform @NAMESPACE@.RebuildListenEntries();     
@@ -1243,7 +1244,7 @@ begin
        return p_failed_node;
 end;
 $$ language plpgsql;
-comment on function @NAMESPACE@.failedNode(p_failed_node int4, p_backup_node int4) is
+comment on function @NAMESPACE@.failedNode(p_failed_node int4, p_backup_node int4,p_failed_nodes integer[]) is
 'Initiate failover from failed_node to backup_node.  This function must be called on all nodes, 
 and then waited for the restart of all node daemons.';
 
@@ -1255,7 +1256,7 @@ and then waited for the restart of all node daemons.';
 --     On the node that has the highest sequence number of the failed node,
 --     fake the FAILED_NODE event.
 -- ----------------------------------------------------------------------
-create or replace function @NAMESPACE@.failedNode2 (p_failed_node int4, p_backup_node int4, p_ev_seqno int8)
+create or replace function @NAMESPACE@.failedNode2 (p_failed_node int4, p_backup_node int4, p_ev_seqno int8, p_failed_nodes integer[])
 returns bigint
 as $$
 declare
@@ -1271,8 +1272,8 @@ begin
                                p_failed_node, p_ev_seqno;
        end if;
 
-       update @NAMESPACE@.sl_node set no_failed=true  where no_id=p_failed_node
-               and no_failed=false;
+       update @NAMESPACE@.sl_node set no_failed=true  where no_id = ANY 
+       (p_failed_nodes) and no_failed=false;
        -- Rewrite sl_listen table
        perform @NAMESPACE@.RebuildListenEntries();
        -- ----
@@ -1283,7 +1284,8 @@ begin
        notify "_@CLUSTERNAME@_Restart";
 
        select @NAMESPACE@.createEvent('_@CLUSTERNAME@','FAILOVER_NODE',
-                                                               p_failed_node::text,p_ev_seqno::text)
+                                                               p_failed_node::text,p_ev_seqno::text,
+                                                               array_to_string(p_failed_nodes,','))
                        into v_new_event;
                
 
@@ -1291,8 +1293,8 @@ begin
 end;
 $$ language plpgsql;
 
-comment on function @NAMESPACE@.failedNode2 (p_failed_node int4, p_backup_node int4, p_ev_seqno int8) is
-'FUNCTION failedNode2 (failed_node, backup_node, set_id, ev_seqno, ev_seqfake)
+comment on function @NAMESPACE@.failedNode2 (p_failed_node int4, p_backup_node int4, p_ev_seqno int8,p_failed_nodes integer[] ) is
+'FUNCTION failedNode2 (failed_node, backup_node, ev_seqno, failed_nodes)
 
 On the node that has the highest sequence number of the failed node,
 fake the FAILOVER_SET event.';
@@ -1323,6 +1325,7 @@ declare
        v_row                           record;
        v_last_sync                     int8;
        v_set                           int4;
+       v_missing_path      int4;
 begin
        SELECT max(ev_seqno) into v_last_sync FROM @NAMESPACE@.sl_event where
                   ev_origin=p_failed_node;
@@ -1358,10 +1361,14 @@ begin
                                        and sub_receiver = p_backup_node;
                        update @NAMESPACE@.sl_set
                                set set_origin = p_backup_node
-                               where set_id = v_set;
+                               where set_id = v_set;           
                         update @NAMESPACE@.sl_subscribe
-                                               set sub_provider=p_backup_node
-                                          where sub_set = v_set;
+                                               set sub_provider=p_backup_node
+                                               FROM @NAMESPACE@.sl_node receive_node
+                                          where sub_set = v_set
+                                          and sub_provider=p_failed_node
+                                          and sub_receiver=receive_node.no_id
+                                          and receive_node.no_failed=false;
 
                        for v_row in select * from @NAMESPACE@.sl_table
                                where tab_set = v_set
@@ -1376,9 +1383,14 @@ begin
                        delete from @NAMESPACE@.sl_subscribe
                                          where sub_set = v_set
                                          and sub_receiver = p_backup_node;
+               
                        update @NAMESPACE@.sl_subscribe
                                                set sub_provider=p_backup_node
-                                          where sub_set = v_set;
+                                               FROM @NAMESPACE@.sl_node receive_node
+                                          where sub_set = v_set
+                                           and sub_provider=p_failed_node
+                                          and sub_receiver=receive_node.no_id
+                                          and receive_node.no_failed=false;
                        update @NAMESPACE@.sl_set
                                           set set_origin = p_backup_node
                                where set_id = v_set;
@@ -4700,7 +4713,8 @@ begin
 
        -- Loop over every possible pair of receiver and event origin
        for v_row in select N1.no_id as receiver, N2.no_id as origin,
-                         N2.no_failed as failed
+                         N2.no_failed as origin_failed,
+                         N1.no_failed as receiver_failed
                        from @NAMESPACE@.sl_node as N1, @NAMESPACE@.sl_node as N2
                        where N1.no_id <> N2.no_id
        loop
@@ -4749,7 +4763,7 @@ begin
                                end if;
                end if;
 
-               if v_row.failed then
+               if v_row.origin_failed then
                
                --for every failed node we delete all sl_listen entries
                --except via providers (listed in sl_subscribe).
diff --git a/src/slon/remote_worker.c b/src/slon/remote_worker.c
index e3c71a51de055cddfd58383d0ba2cfe3fab261b0..0196e16f627d2a3d4916359043605ef22e561412 100644
--- a/src/slon/remote_worker.c
+++ b/src/slon/remote_worker.c
@@ -1189,6 +1189,8 @@ remoteWorkerThread_main(void *cdata)
                        {
                                int                     failed_node = (int) strtol(event->ev_data1, NULL, 10);
                                char       *seq_no_c = event->ev_data2;
+                               char       *failed_node_list  = event->ev_data3;
+
                                PGresult   *res;
 
                                /**
@@ -1200,9 +1202,9 @@ remoteWorkerThread_main(void *cdata)
                                 * The most-ahead failover canidate is the node that
                                 * created the FAILOVER_NODE event (node->id)
                                 */
-                               slon_mkquery(&query2, "select %s.failedNode(%d,%d);"
+                               slon_mkquery(&query2, "select %s.failedNode(%d,%d,ARRAY[%s]);"
                                                         ,rtcfg_namespace,
-                                                        failed_node, node->no_id);
+                                                        failed_node, node->no_id,failed_node_list);
 
                                res = PQexec(local_dbconn, dstring_data(&query2));
                                if (PQresultStatus(res) != PGRES_TUPLES_OK)
diff --git a/src/slonik/slonik.c b/src/slonik/slonik.c
index 3000dcbab6944da3e560c07a177941101c490f2e..bc34fd2ec595eba4e236aca7bf0e5e01f7701536 100644
--- a/src/slonik/slonik.c
+++ b/src/slonik/slonik.c
@@ -170,7 +170,8 @@ static int
 fail_node_promote(SlonikStmt_failed_node * stmt,
                                  failed_node_entry * node_entry,
                                  failnode_node * nodeinfo
-                                 ,int *fail_node_ids);
+                                 ,int *fail_node_ids,
+                                 SlonDString * failed_node_list);
 
 /* ----------
  * main
@@ -2710,6 +2711,7 @@ slonik_drop_node(SlonikStmt_drop_node * stmt)
                                        }
 
                                }
+#if 0 
                                if (slonik_is_slony_installed((SlonikStmt *) stmt, curAdmInfo) > 0)
                                {
                                        rc = slonik_wait_config_caughtup(curAdmInfo, (SlonikStmt *) stmt,
@@ -2717,6 +2719,7 @@ slonik_drop_node(SlonikStmt_drop_node * stmt)
                                        if (rc < 0)
                                                return rc;
                                }
+#endif
                        }
 
                }
@@ -2780,20 +2783,21 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
        failnode_node **max_node_total = NULL;
        failed_node_entry *node_entry = stmt->nodes;
        int                *fail_node_ids = NULL;
-
+       bool         missing_paths=false;
        int                     rc = 0;
 
 
        /**
         *      The failover procedure (at a high level) is as follows
         *
-        * 1. Get a list of failover candidates for each failed node.
-        * 2. validate that we have conninfo to all of them
-        * 3. blank their paths to the failed nodes
-        * 4. Wait for slons to restart
-        * 5. for each failed node get the highest xid for each candidate
-        * 6. execute FAILOVER on the highest canidate
-        * 7. MOVE SET to the backup node.
+        * 1. Check to see if the failover will leave orphan nodes
+        * 2. Get a list of failover candidates for each failed node.
+        * 3. validate that we have conninfo to all of them
+        * 4. blank their paths to the failed nodes
+        * 5. Wait for slons to restart
+        * 6. for each failed node get the highest xid for each candidate
+        * 7. execute FAILOVER on the highest candidate
+        * 8. MOVE SET to the backup node.
         */
 
 
@@ -2828,6 +2832,68 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
        memset(set_list, 0, sizeof(int *) * num_origins);
 
 
+       /**
+        * Validate that the list of failed nodes is complete.
+        * This means that for any provider in sl_subscribe
+        * that has failed there must be a path between
+        * the receiver node and the specified backup node.
+        *
+        * If this isn't the case then the user must also include
+        * the receiver node as a failed node, or add the path
+        * before calling FAILOVER.
+        *
+        * We only check sl_subscribe on an arbitrary backup node.
+        */
+       for (node_entry = stmt->nodes; node_entry != NULL;
+                node_entry = node_entry->next, cur_origin_idx++)
+       {
+               int num_orphans=0;
+               int res_idx;
+               slon_mkquery(&query,
+                                        " select sub_receiver FROM \"_%s\".sl_subscribe left join "
+                                        "\"_%s\".sl_path on (pa_server="
+                                        " %d and pa_client=sub_receiver) where sub_receiver not "
+                                        "in (%s) and sub_provider=%d and sub_receiver <> %d and pa_client is null"
+                                        , stmt->hdr.script->clustername
+                                        , stmt->hdr.script->clustername
+                                        , node_entry->backup_node
+                                        ,  dstring_data(&failed_node_list)
+                                        ,  node_entry->no_id,node_entry->backup_node);
+               adminfo1 = get_active_adminfo((SlonikStmt *) stmt, node_entry->backup_node);
+               if (adminfo1 == NULL)
+               {
+                       printf("%s:%d no admin conninfo for node %d\n",
+                                  stmt->hdr.stmt_filename, stmt->hdr.stmt_lno,
+                                  node_entry->backup_node);
+                       rc = -1;
+                       goto cleanup;
+               }                        
+               res1 = db_exec_select((SlonikStmt *) stmt, adminfo1, &query);
+               if (res1 == NULL)
+               {
+                       rc = -1;
+                       goto cleanup;
+               }
+               num_orphans = PQntuples(res1);
+               for(res_idx=0; res_idx < num_orphans; res_idx++)
+               {
+                       char * receiver=PQgetvalue(res1,res_idx,0);
+                       printf("%s:%d can't failover: node %s does not have a path to %d\n",
+                                  stmt->hdr.stmt_filename, stmt->hdr.stmt_lno,receiver,
+                                  node_entry->backup_node);
+                       missing_paths=true;
+                                  
+
+               }
+               PQclear(res1);
+       }
+       if ( missing_paths )
+       {
+               rc=-1;
+               goto cleanup;
+
+       }
+
 
        /**
         * get the list of failover candidates for each of the
@@ -2935,9 +3001,10 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
                 * Connect to all these nodes and determine if there is a node daemon
                 * running on that node.
                 */
-               has_candidate = true;
+
                for (i = 0; i < node_entry->num_nodes; i++)
                {
+                       has_candidate = true;
                        const char * pidcolumn;
                        nodeinfo[i].no_id = (int) strtol(PQgetvalue(res1, i, 0), NULL, 10);
                        nodeinfo[i].adminfo = get_active_adminfo((SlonikStmt *) stmt,
@@ -2995,15 +3062,15 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
                }
                PQclear(res1);
                PQclear(res2);
-               if (!has_candidate)
+               if (!has_candidate && node_entry->num_sets > 0 )
                {
                        printf("%s:%d error no failover candidates for %d\n",
                                   stmt->hdr.stmt_filename, stmt->hdr.stmt_lno
                                   ,node_entry->no_id);
-                       PQclear(res1);
                        rc = -1;
                        goto cleanup;
                }
+       
 
                /*
                 * Execute the preFailover() procedure on all failover candidate nodes
@@ -3063,10 +3130,14 @@ slonik_failed_node(SlonikStmt_failed_node * stmt)
                 node_entry = node_entry->next, cur_origin_idx++)
        {
 
+               
                failnode_node *nodeinfo = (failnode_node *) failnodebuf[cur_origin_idx];
+               
+               if ( node_entry->num_nodes == 0 )
+                       continue;
 
                rc = fail_node_promote(stmt, node_entry, nodeinfo,
-                                                          fail_node_ids);
+                                                          fail_node_ids,&failed_node_list);
                if (rc < 0)
                {
                        goto cleanup;
@@ -3235,7 +3306,8 @@ int
 fail_node_promote(SlonikStmt_failed_node * stmt,
                                  failed_node_entry * node_entry,
                                  failnode_node * nodeinfo,
-                                 int *fail_node_ids)
+                                 int *fail_node_ids,
+                     SlonDString * failed_node_list)
 {
        int64           max_seqno = 0;
        int                     max_node_idx = 0;
@@ -3250,6 +3322,16 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
 
        dstring_init(&query);
 
+       if ( node_entry->num_nodes == 0 && node_entry->num_sets  == 0 ) 
+       {
+               /**
+                * This node is the origin of no sets but we still need to
+                * let the rest of the cluster know that this node is considered
+                * failed.
+                */
+
+       }
+
        /*
         * For every node determine the one with the event , preferring the backup
         * node.
@@ -3295,18 +3377,19 @@ fail_node_promote(SlonikStmt_failed_node * stmt,
        }
        adminfo1 = nodeinfo[max_node_idx].adminfo;
 
+
        /*
         * Now execute all FAILED_NODE events on the most ahead candidate
         */
        sprintf(ev_seqno_c, INT64_FORMAT, max_seqno);
        slon_mkquery(&query,
                                 "lock table \"_%s\".sl_event_lock, \"_%s\".sl_config_lock;"
-                                "select \"_%s\".failedNode2(%d,%d,'%s'); ",
+                                "select \"_%s\".failedNode2(%d,%d,'%s',ARRAY[%s]); ",
                                 stmt->hdr.script->clustername,
                                 stmt->hdr.script->clustername,
                                 stmt->hdr.script->clustername,
                                 node_entry->no_id, nodeinfo[max_node_idx].no_id
-                                ,ev_seqno_c);
+                                ,ev_seqno_c,dstring_data(failed_node_list));
        printf("NOTICE: executing \"_%s\".failedNode2 on node %d\n",
                   stmt->hdr.script->clustername,
                   nodeinfo[max_node_idx].no_id);