only run "serverid wipe" if we are actually running samba.

[sahlberg/ctdb.git] / server / ctdb_recoverd.c
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c

index 6a453f98d8b05229806f0b37519fc6a581c45b26..9caa5024dd653e89bbabd97fc651e066a4f6ecfc 100644 (file)
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -18,14 +18,14 @@
  */
  
  #include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
  #include "system/filesys.h"
  #include "system/time.h"
  #include "system/network.h"
  #include "system/wait.h"
  #include "popt.h"
  #include "cmdline.h"
-#include "../include/ctdb.h"
+#include "../include/ctdb_client.h"
  #include "../include/ctdb_private.h"
  #include "db_wrap.h"
  #include "dlinklist.h"
@@ -64,6 +64,7 @@ struct ctdb_recoverd {
         TALLOC_CTX *ip_reallocate_ctx;
         struct ip_reallocate_list *reallocate_callers;
         TALLOC_CTX *ip_check_disable_ctx;
+       struct ctdb_control_get_ifaces *ifaces;
  };
  
  #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
@@ -211,7 +212,7 @@ static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_
  static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
  {
         if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
-               DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n",  (unsigned)outdata.dsize, outdata.dptr));
+               DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n",  (unsigned)outdata.dsize, outdata.dptr));
                 return;
         }
         if (node_pnn < ctdb->num_nodes) {
@@ -521,7 +522,8 @@ static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb
    pull the remote database contents from one node into the recdb
   */
  static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode, 
-                                   struct tdb_wrap *recdb, uint32_t dbid)
+                                   struct tdb_wrap *recdb, uint32_t dbid,
+                                   bool persistent)
  {
         int ret;
         TDB_DATA outdata;
@@ -606,7 +608,8 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
  static int pull_remote_database(struct ctdb_context *ctdb,
                                 struct ctdb_recoverd *rec, 
                                 struct ctdb_node_map *nodemap, 
-                               struct tdb_wrap *recdb, uint32_t dbid)
+                               struct tdb_wrap *recdb, uint32_t dbid,
+                               bool persistent)
  {
         int j;
  
@@ -618,7 +621,7 @@ static int pull_remote_database(struct ctdb_context *ctdb,
                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                         continue;
                 }
-               if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
+               if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid, persistent) != 0) {
                         DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n", 
                                  nodemap->nodes[j].pnn));
                         ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
@@ -884,10 +887,11 @@ static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
  /*
    wait for a given number of seconds
   */
-static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
+static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
  {
         uint32_t timed_out = 0;
-       event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
+       time_t usecs = (secs - (time_t)secs) * 1000000;
+       event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
         while (!timed_out) {
                 event_loop_once(ctdb->ev);
         }
@@ -901,6 +905,7 @@ static void ctdb_election_timeout(struct event_context *ev, struct timed_event *
  {
         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
         rec->election_timeout = NULL;
+       fast_start = false;
  
         DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
  }
@@ -1008,16 +1013,19 @@ static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_
         unsigned tdb_flags;
  
         /* open up the temporary recovery database */
-       name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
+       name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
+                              ctdb->db_directory_state,
+                              ctdb->pnn);
         if (name == NULL) {
                 return NULL;
         }
         unlink(name);
  
         tdb_flags = TDB_NOLOCK;
-       if (!ctdb->do_setsched) {
+       if (ctdb->valgrinding) {
                 tdb_flags |= TDB_NOMMAP;
         }
+       tdb_flags |= TDB_DISALLOW_NESTING;
  
         recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size, 
                               tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
@@ -1039,6 +1047,7 @@ struct recdb_data {
         struct ctdb_marshall_buffer *recdata;
         uint32_t len;
         bool failed;
+       bool persistent;
  };
  
  static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
@@ -1054,7 +1063,9 @@ static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
  
         /* update the dmaster field to point to us */
         hdr = (struct ctdb_ltdb_header *)data.dptr;
-       hdr->dmaster = params->ctdb->pnn;
+       if (!params->persistent) {
+               hdr->dmaster = params->ctdb->pnn;
+       }
  
         /* add the record to the blob ready to send to the nodes */
         rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
@@ -1081,6 +1092,7 @@ static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
    push the recdb database out to all nodes
   */
  static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
+                              bool persistent,
                                struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
  {
         struct recdb_data params;
@@ -1101,6 +1113,7 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
         params.recdata = recdata;
         params.len = offsetof(struct ctdb_marshall_buffer, data);
         params.failed = false;
+       params.persistent = persistent;
  
         if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
                 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
@@ -1149,6 +1162,7 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
  static int recover_database(struct ctdb_recoverd *rec, 
                             TALLOC_CTX *mem_ctx,
                             uint32_t dbid,
+                           bool persistent,
                             uint32_t pnn, 
                             struct ctdb_node_map *nodemap,
                             uint32_t transaction_id)
@@ -1166,7 +1180,7 @@ static int recover_database(struct ctdb_recoverd *rec,
         }
  
         /* pull all remote databases onto the recdb */
-       ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
+       ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
                 return -1;
@@ -1194,7 +1208,7 @@ static int recover_database(struct ctdb_recoverd *rec,
         
         /* push out the correct database. This sets the dmaster and skips 
            the empty records */
-       ret = push_recdb_database(ctdb, dbid, recdb, nodemap);
+       ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
         if (ret != 0) {
                 talloc_free(recdb);
                 return -1;
@@ -1215,7 +1229,118 @@ static void reload_nodes_file(struct ctdb_context *ctdb)
         ctdb_load_nodes_file(ctdb);
  }
  
-       
+/*
+  Refresh the cached public ip lists of every node in the nodemap.
+
+  For each node the previously cached known_public_ips and
+  available_public_ips are released first. Nodes flagged
+  NODE_FLAGS_INACTIVE are then skipped; for all other nodes both lists
+  are fetched again with ctdb_ctrl_get_public_ips_flags().
+
+  When ip failover is enabled (tunable disable_ip_failover == 0) and
+  rec->ip_check_disable_ctx is NULL, the freshly read known list is
+  handed to verify_remote_ip_allocation(); a non-zero result sets
+  rec->need_takeover_run.
+
+  Returns 0 on success and -1 on failure. On failure *culprit (when
+  culprit is not NULL) is set to the pnn of the node whose list could
+  not be read, or to the local pnn when ctdb->num_nodes and
+  nodemap->num disagree.
+*/
+static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
+                                        struct ctdb_recoverd *rec,
+                                        struct ctdb_node_map *nodemap,
+                                        uint32_t *culprit)
+{
+       int j;
+       int ret;
+
+       /* ctdb->nodes[] and nodemap->nodes[] are indexed in parallel below */
+       if (ctdb->num_nodes != nodemap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
+                                 ctdb->num_nodes, nodemap->num));
+               if (culprit) {
+                       *culprit = ctdb->pnn;
+               }
+               return -1;
+       }
+
+       for (j=0; j<nodemap->num; j++) {
+               /* release any existing data */
+               if (ctdb->nodes[j]->known_public_ips) {
+                       talloc_free(ctdb->nodes[j]->known_public_ips);
+                       ctdb->nodes[j]->known_public_ips = NULL;
+               }
+               if (ctdb->nodes[j]->available_public_ips) {
+                       talloc_free(ctdb->nodes[j]->available_public_ips);
+                       ctdb->nodes[j]->available_public_ips = NULL;
+               }
+
+               /* inactive nodes keep both lists cleared */
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               /* grab a new shiny list of all known public ips from the node */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       CONTROL_TIMEOUT(),
+                                       ctdb->nodes[j]->pnn,
+                                       ctdb->nodes,
+                                       0,
+                                       &ctdb->nodes[j]->known_public_ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
+                               ctdb->nodes[j]->pnn));
+                       if (culprit) {
+                               *culprit = ctdb->nodes[j]->pnn;
+                       }
+                       return -1;
+               }
+
+               /* only verify the allocation while failover is enabled and
+                  the ip check has not been disabled */
+               if (ctdb->tunable.disable_ip_failover == 0) {
+                       if (rec->ip_check_disable_ctx == NULL) {
+                               if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
+                                       DEBUG(DEBUG_ERR,("Node %u has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
+                                       rec->need_takeover_run = true;
+                               }
+                       }
+               }
+
+               /* second fetch: only the public ips flagged as available
+                  (CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       CONTROL_TIMEOUT(),
+                                       ctdb->nodes[j]->pnn,
+                                       ctdb->nodes,
+                                       CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
+                                       &ctdb->nodes[j]->available_public_ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
+                               ctdb->nodes[j]->pnn));
+                       if (culprit) {
+                               *culprit = ctdb->nodes[j]->pnn;
+                       }
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+  Push the local reclock file setting to all active nodes so that the
+  whole cluster uses the same value when a recovery starts.
+
+  The payload is the NUL terminated path from ctdb->recovery_lock_file,
+  or an empty blob when no reclock file is configured. It is sent with
+  CTDB_CONTROL_SET_RECLOCK_FILE to the nodes returned by
+  list_of_active_nodes().
+
+  Returns 0 on success, -1 if the control failed.
+*/
+static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       TALLOC_CTX *scratch = talloc_new(NULL);
+       TDB_DATA reclock;
+       uint32_t *targets;
+       int status;
+
+       /* start from "no reclock file" and fill in the path if we have one */
+       reclock.dptr  = NULL;
+       reclock.dsize = 0;
+       if (ctdb->recovery_lock_file != NULL) {
+               reclock.dptr  = (uint8_t *)ctdb->recovery_lock_file;
+               reclock.dsize = strlen(ctdb->recovery_lock_file) + 1;
+       }
+
+       targets = list_of_active_nodes(ctdb, rec->nodemap, scratch, true);
+
+       status = ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
+                                       targets, 0,
+                                       CONTROL_TIMEOUT(),
+                                       false, reclock,
+                                       NULL, NULL,
+                                       rec);
+       if (status != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
+               status = -1;
+       }
+
+       /* the node list lives on the scratch context; drop it on every path */
+       talloc_free(scratch);
+       return status;
+}
+
+
  /*
    we are the recmaster, and recovery is needed - start a recovery run
   */
@@ -1230,6 +1355,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
         TDB_DATA data;
         uint32_t *nodes;
         struct timeval start_time;
+       uint32_t culprit = (uint32_t)-1;
  
         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
  
@@ -1263,7 +1389,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
                         return -1;
                 }
                 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
-               DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
+               DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
         }
  
         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
@@ -1301,6 +1427,11 @@ static int do_recovery(struct ctdb_recoverd *rec,
         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
  
  
+       /* update all other nodes to use the same setting for reclock files
+          as the local recovery master.
+       */
+       sync_recovery_lock_file_across_cluster(rec);
+
         /* set recovery mode to active on all nodes */
         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
         if (ret != 0) {
@@ -1315,6 +1446,23 @@ static int do_recovery(struct ctdb_recoverd *rec,
                 return -1;
         }
  
+       /*
+         update all nodes to have the same flags that we have
+        */
+       for (i=0;i<nodemap->num;i++) {
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+
+               ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+                       return -1;
+               }
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
         /* pick a new generation number */
         generation = new_generation();
  
@@ -1360,7 +1508,11 @@ static int do_recovery(struct ctdb_recoverd *rec,
         DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
  
         for (i=0;i<dbmap->num;i++) {
-               if (recover_database(rec, mem_ctx, dbmap->dbs[i].dbid, pnn, nodemap, generation) != 0) {
+               ret = recover_database(rec, mem_ctx,
+                                      dbmap->dbs[i].dbid,
+                                      dbmap->dbs[i].persistent,
+                                      pnn, nodemap, generation);
+               if (ret != 0) {
                         DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
                         return -1;
                 }
@@ -1468,6 +1620,12 @@ static int do_recovery(struct ctdb_recoverd *rec,
         /*
           tell nodes to takeover their public IPs
          */
+       ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                culprit));
+               return -1;
+       }
         rec->need_takeover_run = false;
         ret = ctdb_takeover_run(ctdb, nodemap);
         if (ret != 0) {
@@ -1487,7 +1645,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
  
         /* send a message to all clients telling them that the cluster 
            has been reconfigured */
-       ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
+       ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
  
         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
  
@@ -1518,9 +1676,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
            We now wait for rerecovery_timeout before we allow 
            another recovery to take place.
         */
-       DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
+       DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
         ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
-       DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
+       DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
  
         return 0;
  }
@@ -1648,7 +1806,7 @@ static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool u
  
         /* send an election message to all active nodes */
         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
-       ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
+       ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
  
  
         /* A new node that is already frozen has entered the cluster.
@@ -1746,7 +1904,7 @@ static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
  
  DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));           
  
-       ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
+       ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
                 talloc_free(tmp_ctx);
@@ -1779,6 +1937,29 @@ static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
         rec->ip_check_disable_ctx = NULL;
  }
  
+
+/*
+  Message handler for a recd "update ip" message.
+
+  The message is ignored unless this node is the recovery master
+  (rec->recmaster == rec->ctdb->pnn). The payload must be exactly one
+  struct ctdb_public_ip; anything else is logged and dropped. A valid
+  payload is passed on to update_ip_assignment_tree().
+
+  private_data is the struct ctdb_recoverd; the ctdb and srvid
+  arguments are not used here.
+*/
+static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+       struct ctdb_public_ip *ip;
+
+       if (rec->recmaster != rec->ctdb->pnn) {
+               DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
+               return;
+       }
+
+       /* both values are size_t, so print them with %zu rather than %zd */
+       if (data.dsize != sizeof(struct ctdb_public_ip)) {
+               DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
+               return;
+       }
+
+       ip = (struct ctdb_public_ip *)data.dptr;
+
+       update_ip_assignment_tree(rec->ctdb, ip);
+}
+
+
  static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid, 
                              TDB_DATA data, void *private_data)
  {
@@ -1848,17 +2029,41 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb
         TDB_DATA result;
         int32_t ret;
         struct ip_reallocate_list *callers;
+       uint32_t culprit;
  
         DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
-       ret = ctdb_takeover_run(ctdb, rec->nodemap);
+
+       /* update the list of public ips that a node can handle for
+          all connected nodes
+       */
+       ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                culprit));
+               rec->need_takeover_run = true;
+       }
+       if (ret == 0) {
+               ret = ctdb_takeover_run(ctdb, rec->nodemap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                        culprit));
+                       rec->need_takeover_run = true;
+               }
+       }
+
         result.dsize = sizeof(int32_t);
         result.dptr  = (uint8_t *)&ret;
  
         for (callers=rec->reallocate_callers; callers; callers=callers->next) {
+
+               /* Someone that sent srvid==0 does not want a reply */
+               if (callers->rd->srvid == 0) {
+                       continue;
+               }
                 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
                                   "%u:%llu\n", (unsigned)callers->rd->pnn,
                                   (unsigned long long)callers->rd->srvid));
-               ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
+               ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
                                          "message to %u:%llu\n",
@@ -1889,6 +2094,8 @@ static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
         /* we got an election packet - update the timeout for the election */
         talloc_free(rec->election_timeout);
         rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                 ctdb_election_timeout, rec);
  
@@ -1956,6 +2163,8 @@ static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
  
         talloc_free(rec->election_timeout);
         rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                 ctdb_election_timeout, rec);
  
@@ -2056,11 +2265,47 @@ static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
  {
         int ret;
         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
+       struct ctdb_node_map *nodemap=NULL;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       uint32_t recmaster;
+       uint32_t *nodes;
  
-       ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), c->pnn, c->new_flags, ~c->new_flags);
+       /* find the recovery master */
+       ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
         if (ret != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       /* read the node flags from the recmaster */
+       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
+               talloc_free(tmp_ctx);
+               return;
         }
+       if (c->pnn >= nodemap->num) {
+               DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       /* send the flags update to all connected nodes */
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
+                                     nodes, 0, CONTROL_TIMEOUT(),
+                                     false, data,
+                                     NULL, NULL,
+                                     NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
+
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       talloc_free(tmp_ctx);
  }
  
  
@@ -2248,15 +2493,18 @@ static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ct
  }
  
  
-/* called to check that the allocation of public ip addresses is ok.
+/* called to check that the local allocation of public ip addresses is ok.
  */
-static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
+static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
  {
         TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       struct ctdb_control_get_ifaces *ifaces = NULL;
         struct ctdb_all_public_ips *ips = NULL;
         struct ctdb_uptime *uptime1 = NULL;
         struct ctdb_uptime *uptime2 = NULL;
         int ret, j;
+       bool need_iface_check = false;
+       bool need_takeover_run = false;
  
         ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
                                 CTDB_CURRENT_NODE, &uptime1);
@@ -2266,6 +2514,30 @@ static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
                 return -1;
         }
  
+
+       /* read the interfaces from the local node */
+       ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
+               talloc_free(mem_ctx);
+               return -1;
+       }
+
+       if (!rec->ifaces) {
+               need_iface_check = true;
+       } else if (rec->ifaces->num != ifaces->num) {
+               need_iface_check = true;
+       } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
+               need_iface_check = true;
+       }
+
+       if (need_iface_check) {
+               DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
+                                    "local node %u - force takeover run\n",
+                                    pnn));
+               need_takeover_run = true;
+       }
+
         /* read the ip allocation from the local node */
         ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
         if (ret != 0) {
@@ -2307,55 +2579,55 @@ static int verify_ip_allocation(struct ctdb_context *ctdb, uint32_t pnn)
                 return 0;
         }
  
+       talloc_free(rec->ifaces);
+       rec->ifaces = talloc_steal(rec, ifaces);
+
         /* verify that we have the ip addresses we should have
            and we dont have ones we shouldnt have.
            if we find an inconsistency we set recmode to
            active on the local node and wait for the recmaster
-          to do a full blown recovery
+          to do a full blown recovery.
+          also if the pnn is -1 and we are healthy and can host the ip
+          we also request a ip reallocation.
         */
-       for (j=0; j<ips->num; j++) {
-               if (ips->ips[j].pnn == pnn) {
-                       if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
-                               DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
-                                       ctdb_addr_to_str(&ips->ips[j].addr)));
-                               ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
-                               if (ret != 0) {
-                                       DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
-
-                                       talloc_free(mem_ctx);
-                                       return -1;
+       if (ctdb->tunable.disable_ip_failover == 0) {
+               for (j=0; j<ips->num; j++) {
+                       if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
+                               DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                               need_takeover_run = true;
+                       } else if (ips->ips[j].pnn == pnn) {
+                               if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
+                                       DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                                       need_takeover_run = true;
                                 }
-                               ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
-                               if (ret != 0) {
-                                       DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
-
-                                       talloc_free(mem_ctx);
-                                       return -1;
+                       } else {
+                               if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
+                                       DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", 
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                                       need_takeover_run = true;
                                 }
                         }
-               } else {
-                       if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
-                               DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", 
-                                       ctdb_addr_to_str(&ips->ips[j].addr)));
+               }
+       }
  
-                               ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
-                               if (ret != 0) {
-                                       DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to public ip address mismatches\n"));
+       if (need_takeover_run) {
+               struct takeover_run_reply rd;
+               TDB_DATA data;
  
-                                       talloc_free(mem_ctx);
-                                       return -1;
-                               }
-                               ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
-                               if (ret != 0) {
-                                       DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to public ip address mismatches\n"));
+               DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
  
-                                       talloc_free(mem_ctx);
-                                       return -1;
-                               }
-                       }
+               rd.pnn = ctdb->pnn;
+               rd.srvid = 0;
+               data.dptr = (uint8_t *)&rd;
+               data.dsize = sizeof(rd);
+
+               ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
                 }
         }
-
         talloc_free(mem_ctx);
         return 0;
  }
@@ -2512,6 +2784,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                 close(state->fd[0]);
                 state->fd[0] = -1;
  
+               debug_extra = talloc_asprintf(NULL, "recovery-lock:");
                 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
                         DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
                         cc = RECLOCK_FAILED;
@@ -2542,7 +2815,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
         }
  
         state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
-                               EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+                               EVENT_FD_READ,
                                 reclock_child_handler,
                                 (void *)state);
  
@@ -2551,6 +2824,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                 talloc_free(state);
                 return -1;
         }
+       tevent_fd_set_auto_close(state->fde);
  
         while (state->status == RECLOCK_CHECKING) {
                 event_loop_once(ctdb->ev);
@@ -2621,14 +2895,11 @@ static int update_recovery_lock_file(struct ctdb_context *ctdb)
         talloc_free(tmp_ctx);
         return 0;
  }
-               
-/*
-  the main monitoring loop
- */
-static void monitor_cluster(struct ctdb_context *ctdb)
+
+static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
+                     TALLOC_CTX *mem_ctx)
  {
         uint32_t pnn;
-       TALLOC_CTX *mem_ctx=NULL;
         struct ctdb_node_map *nodemap=NULL;
         struct ctdb_node_map *recmaster_nodemap=NULL;
         struct ctdb_node_map **remote_nodemaps=NULL;
@@ -2636,54 +2907,8 @@ static void monitor_cluster(struct ctdb_context *ctdb)
         struct ctdb_vnn_map *remote_vnnmap=NULL;
         int32_t debug_level;
         int i, j, ret;
-       struct ctdb_recoverd *rec;
-
-       DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
-
-       rec = talloc_zero(ctdb, struct ctdb_recoverd);
-       CTDB_NO_MEMORY_FATAL(ctdb, rec);
  
-       rec->ctdb = ctdb;
-
-       rec->priority_time = timeval_current();
  
-       /* register a message port for sending memory dumps */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
-
-       /* register a message port for recovery elections */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
-
-       /* when nodes are disabled/enabled */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
-
-       /* when we are asked to puch out a flag change */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
-
-       /* register a message port for vacuum fetch */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
-
-       /* register a message port for reloadnodes  */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
-
-       /* register a message port for performing a takeover run */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
-
-       /* register a message port for disabling the ip check for a short while */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
-
-again:
-       if (mem_ctx) {
-               talloc_free(mem_ctx);
-               mem_ctx = NULL;
-       }
-       mem_ctx = talloc_new(ctdb);
-       if (!mem_ctx) {
-               DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
-               exit(-1);
-       }
-
-       /* we only check for recovery once every second */
-       ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
  
         /* verify that the main daemon is still running */
         if (kill(ctdb->ctdbd_pid, 0) != 0) {
@@ -2696,14 +2921,14 @@ again:
  
         if (rec->election_timeout) {
                 /* an election is in progress */
-               goto again;
+               return;
         }
  
         /* read the debug level from the parent and update locally */
         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
         if (ret !=0) {
                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
-               goto again;
+               return;
         }
         LogLevel = debug_level;
  
@@ -2733,13 +2958,13 @@ again:
         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
-               goto again;
+               return;
         }
  
         /* get the current recovery lock file from the server */
         if (update_recovery_lock_file(ctdb) != 0) {
                 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
-               goto again;
+               return;
         }
  
         /* Make sure that if recovery lock verification becomes disabled when
@@ -2755,14 +2980,14 @@ again:
         pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
         if (pnn == (uint32_t)-1) {
                 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
-               goto again;
+               return;
         }
  
         /* get the vnnmap */
         ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
-               goto again;
+               return;
         }
  
  
@@ -2775,7 +3000,7 @@ again:
         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
-               goto again;
+               return;
         }
         nodemap = rec->nodemap;
  
@@ -2783,7 +3008,7 @@ again:
         ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
-               goto again;
+               return;
         }
  
         /* if we are not the recmaster we can safely ignore any ip reallocate requests */
@@ -2802,7 +3027,7 @@ again:
         if (rec->recmaster == (uint32_t)-1) {
                 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
  
@@ -2820,15 +3045,15 @@ again:
                         ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
                         if (ret != 0) {
                                 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
-                               goto again;
+                               return;
                         }
                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
                         if (ret != 0) {
                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
  
-                               goto again;
+                               return;
                         }
-                       goto again;
+                       return;
                 }
         }
         /* If the local node is stopped, verify we are not the recmaster 
@@ -2837,7 +3062,7 @@ again:
         if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
                 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
         
         /* check that we (recovery daemon) and the local ctdb daemon
@@ -2871,14 +3096,14 @@ again:
         if (j == nodemap->num) {
                 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
         /* if recovery master is disconnected we must elect a new recmaster */
         if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
         /* grap the nodemap from the recovery master to check if it is banned */
@@ -2887,25 +3112,24 @@ again:
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n", 
                           nodemap->nodes[j].pnn));
-               goto again;
+               return;
         }
  
  
         if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
  
         /* verify that we have all ip addresses we should have and we dont
          * have addresses we shouldnt have.
          */ 
-       if (ctdb->do_checkpublicip) {
+       if (ctdb->tunable.disable_ip_failover == 0) {
                 if (rec->ip_check_disable_ctx == NULL) {
-                       if (verify_ip_allocation(ctdb, pnn) != 0) {
+                       if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
                                 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
-                               goto again;
                         }
                 }
         }
@@ -2915,7 +3139,7 @@ again:
            if recovery is needed
          */
         if (pnn != rec->recmaster) {
-               goto again;
+               return;
         }
  
  
@@ -2924,63 +3148,38 @@ again:
         if (ret == MONITOR_ELECTION_NEEDED) {
                 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
         if (ret != MONITOR_OK) {
                 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
-               goto again;
+               return;
         }
  
-       /* update the list of public ips that a node can handle for
-          all connected nodes
-       */
         if (ctdb->num_nodes != nodemap->num) {
                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
                 reload_nodes_file(ctdb);
-               goto again;
-       }
-       for (j=0; j<nodemap->num; j++) {
-               /* release any existing data */
-               if (ctdb->nodes[j]->public_ips) {
-                       talloc_free(ctdb->nodes[j]->public_ips);
-                       ctdb->nodes[j]->public_ips = NULL;
-               }
-
-               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
-                       continue;
-               }
-
-               /* grab a new shiny list of public ips from the node */
-               if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
-                       ctdb->nodes[j]->pnn, 
-                       ctdb->nodes,
-                       &ctdb->nodes[j]->public_ips)) {
-                       DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n", 
-                               ctdb->nodes[j]->pnn));
-                       goto again;
-               }
+               return;
         }
  
-
         /* verify that all active nodes agree that we are the recmaster */
         switch (verify_recmaster(rec, nodemap, pnn)) {
         case MONITOR_RECOVERY_NEEDED:
                 /* can not happen */
-               goto again;
+               return;
         case MONITOR_ELECTION_NEEDED:
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         case MONITOR_OK:
                 break;
         case MONITOR_FAILED:
-               goto again;
+               return;
         }
  
  
         if (rec->need_recovery) {
                 /* a previous recovery didn't finish */
                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-               goto again;             
+               return;
         }
  
         /* verify that all active nodes are in normal mode 
@@ -2989,9 +3188,9 @@ again:
         switch (verify_recmode(ctdb, nodemap)) {
         case MONITOR_RECOVERY_NEEDED:
                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-               goto again;
+               return;
         case MONITOR_FAILED:
-               goto again;
+               return;
         case MONITOR_ELECTION_NEEDED:
                 /* can not happen */
         case MONITOR_OK:
@@ -3006,7 +3205,7 @@ again:
                         DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
                         ctdb_set_culprit(rec, ctdb->pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
         }
  
@@ -3015,14 +3214,14 @@ again:
         remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
         if (remote_nodemaps == NULL) {
                 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
-               goto again;
+               return;
         }
         for(i=0; i<nodemap->num; i++) {
                 remote_nodemaps[i] = NULL;
         }
         if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
                 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
-               goto again;
+               return;
         } 
  
         /* verify that all other nodes have the same nodemap as we have
@@ -3036,7 +3235,7 @@ again:
                         DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
                         ctdb_set_culprit(rec, j);
  
-                       goto again;
+                       return;
                 }
  
                 /* if the nodes disagree on how many nodes there are
@@ -3047,7 +3246,7 @@ again:
                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
  
                 /* if the nodes disagree on which nodes exist and are
@@ -3061,7 +3260,7 @@ again:
                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                 do_recovery(rec, mem_ctx, pnn, nodemap, 
                                             vnnmap);
-                               goto again;
+                               return;
                         }
                 }
  
@@ -3084,14 +3283,14 @@ again:
                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                         do_recovery(rec, mem_ctx, pnn, nodemap, 
                                                     vnnmap);
-                                       goto again;
+                                       return;
                                 } else {
                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                         do_recovery(rec, mem_ctx, pnn, nodemap, 
                                                     vnnmap);
-                                       goto again;
+                                       return;
                                 }
                         }
                 }
@@ -3106,7 +3305,7 @@ again:
                           vnnmap->size, rec->num_active));
                 ctdb_set_culprit(rec, ctdb->pnn);
                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-               goto again;
+               return;
         }
  
         /* verify that all active nodes in the nodemap also exist in 
@@ -3130,7 +3329,7 @@ again:
                                   nodemap->nodes[j].pnn));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
         }
  
@@ -3151,7 +3350,7 @@ again:
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n", 
                                   nodemap->nodes[j].pnn));
-                       goto again;
+                       return;
                 }
  
                 /* verify the vnnmap generation is the same */
@@ -3160,7 +3359,7 @@ again:
                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
  
                 /* verify the vnnmap size is the same */
@@ -3169,7 +3368,7 @@ again:
                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
  
                 /* verify the vnnmap is the same */
@@ -3180,21 +3379,36 @@ again:
                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                 do_recovery(rec, mem_ctx, pnn, nodemap, 
                                             vnnmap);
-                               goto again;
+                               return;
                         }
                 }
         }
  
         /* we might need to change who has what IP assigned */
         if (rec->need_takeover_run) {
+               uint32_t culprit = (uint32_t)-1;
+
                 rec->need_takeover_run = false;
  
+               /* update the list of public ips that a node can handle for
+                  all connected nodes
+               */
+               ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                        culprit));
+                       ctdb_set_culprit(rec, culprit);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
+               }
+
                 /* execute the "startrecovery" event script on all nodes */
                 ret = run_startrecovery_eventscript(rec, nodemap);
                 if (ret!=0) {
                         DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
                         ctdb_set_culprit(rec, ctdb->pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
                 }
  
                 ret = ctdb_takeover_run(ctdb, nodemap);
@@ -3202,6 +3416,7 @@ again:
                         DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
                         ctdb_set_culprit(rec, ctdb->pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
                 }
  
                 /* execute the "recovered" event script on all nodes */
@@ -3218,10 +3433,73 @@ again:
                 }
  #endif
         }
+}
  
+/*
+  the main monitoring loop
+ */
+static void monitor_cluster(struct ctdb_context *ctdb)
+{
+       struct ctdb_recoverd *rec;
+
+       DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
+
+       rec = talloc_zero(ctdb, struct ctdb_recoverd);
+       CTDB_NO_MEMORY_FATAL(ctdb, rec);
  
-       goto again;
+       rec->ctdb = ctdb;
+
+       rec->priority_time = timeval_current();
+
+       /* register a message port for sending memory dumps */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
+
+       /* register a message port for recovery elections */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
+
+       /* when nodes are disabled/enabled */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
  
+       /* when we are asked to push out a flag change */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
+
+       /* register a message port for vacuum fetch */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
+
+       /* register a message port for reloadnodes  */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
+
+       /* register a message port for performing a takeover run */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
+
+       /* register a message port for disabling the ip check for a short while */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
+
+       /* register a message port for updating the recovery daemons node assignment for an ip */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
+
+       for (;;) {
+               TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+               struct timeval start;
+               double elapsed;
+
+               if (!mem_ctx) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         " Failed to create temp context\n"));
+                       exit(-1);
+               }
+
+               start = timeval_current();
+               main_loop(ctdb, rec, mem_ctx);
+               talloc_free(mem_ctx);
+
+               /* we only check for recovery once per recover_interval */
+               elapsed = timeval_elapsed(&start);
+               if (elapsed < ctdb->tunable.recover_interval) {
+                       ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
+                                         - elapsed);
+               }
+       }
  }
  
  /*
@@ -3252,7 +3530,7 @@ static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
                 if (ctdb->methods != NULL) {
                         ctdb->methods->shutdown(ctdb);
                 }
-               ctdb_event_script(ctdb, "shutdown");
+               ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
  
                 exit(10);       
         }
@@ -3292,6 +3570,7 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
  {
         int fd[2];
         struct signal_event *se;
+       struct tevent_fd *fde;
  
         if (pipe(fd) != 0) {
                 return -1;
@@ -3316,15 +3595,16 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
  
         srandom(getpid() ^ time(NULL));
  
-       if (switch_from_server_to_client(ctdb) != 0) {
+       if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
                 exit(1);
         }
  
-       DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
  
-       event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE, 
+       fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
                      ctdb_recoverd_parent, &fd[0]);     
+       tevent_fd_set_auto_close(fde);
  
         /* set up a handler to pick up sigchld */
         se = event_add_signal(ctdb->ev, ctdb,