Merge branch 'master-readonly-records' into foo

[sahlberg/ctdb.git] / server / ctdb_recoverd.c
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c

index 3e596da9ecad9d88c8aa99184e4c94dac10a7720..631f53e89bd4d04908b4b3abc8b1da761d86163a 100644 (file)
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -18,14 +18,14 @@
  */
  
  #include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
  #include "system/filesys.h"
  #include "system/time.h"
  #include "system/network.h"
  #include "system/wait.h"
  #include "popt.h"
  #include "cmdline.h"
-#include "../include/ctdb.h"
+#include "../include/ctdb_client.h"
  #include "../include/ctdb_private.h"
  #include "db_wrap.h"
  #include "dlinklist.h"
@@ -64,11 +64,13 @@ struct ctdb_recoverd {
         TALLOC_CTX *ip_reallocate_ctx;
         struct ip_reallocate_list *reallocate_callers;
         TALLOC_CTX *ip_check_disable_ctx;
+       struct ctdb_control_get_ifaces *ifaces;
  };
  
  #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
  #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
  
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
  
  /*
    ban a node for a period of time
@@ -211,7 +213,7 @@ static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_
  static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
  {
         if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
-               DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for getcap callback : %u %p\n",  (unsigned)outdata.dsize, outdata.dptr));
+               DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n",  (unsigned)outdata.dsize, outdata.dptr));
                 return;
         }
         if (node_pnn < ctdb->num_nodes) {
@@ -437,7 +439,8 @@ static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctd
                                 return -1;
                         }
                         ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
-                                          mem_ctx, name, dbmap->dbs[db].persistent);
+                                          mem_ctx, name,
+                                          dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
                         if (ret != 0) {
                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
                                 return -1;
@@ -500,7 +503,7 @@ static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb
                                 return -1;
                         }
                         ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name, 
-                                          remote_dbmap->dbs[db].persistent);
+                                          remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
                         if (ret != 0) {
                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
                                 return -1;
@@ -712,6 +715,7 @@ static void vacuum_fetch_next(struct vacuum_info *v)
                 ZERO_STRUCT(call);
                 call.call_id = CTDB_NULL_FUNC;
                 call.flags = CTDB_IMMEDIATE_MIGRATION;
+               call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
  
                 r = v->r;
                 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
@@ -820,7 +824,7 @@ static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
  
         for (i=0;i<dbmap->num;i++) {
                 if (dbmap->dbs[i].dbid == recs->db_id) {
-                       persistent = dbmap->dbs[i].persistent;
+                       persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
                         break;
                 }
         }
@@ -838,7 +842,7 @@ static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
         }
  
         /* attach to it */
-       ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
+       ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
         if (ctdb_db == NULL) {
                 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
                 talloc_free(tmp_ctx);
@@ -886,10 +890,11 @@ static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
  /*
    wait for a given number of seconds
   */
-static void ctdb_wait_timeout(struct ctdb_context *ctdb, uint32_t secs)
+static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
  {
         uint32_t timed_out = 0;
-       event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, 0), ctdb_wait_handler, &timed_out);
+       time_t usecs = (secs - (time_t)secs) * 1000000;
+       event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
         while (!timed_out) {
                 event_loop_once(ctdb->ev);
         }
@@ -903,6 +908,7 @@ static void ctdb_election_timeout(struct event_context *ev, struct timed_event *
  {
         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
         rec->election_timeout = NULL;
+       fast_start = false;
  
         DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
  }
@@ -1010,16 +1016,19 @@ static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_
         unsigned tdb_flags;
  
         /* open up the temporary recovery database */
-       name = talloc_asprintf(mem_ctx, "%s/recdb.tdb", ctdb->db_directory);
+       name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
+                              ctdb->db_directory_state,
+                              ctdb->pnn);
         if (name == NULL) {
                 return NULL;
         }
         unlink(name);
  
         tdb_flags = TDB_NOLOCK;
-       if (!ctdb->do_setsched) {
+       if (ctdb->valgrinding) {
                 tdb_flags |= TDB_NOMMAP;
         }
+       tdb_flags |= TDB_DISALLOW_NESTING;
  
         recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size, 
                               tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
@@ -1059,6 +1068,7 @@ static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
         hdr = (struct ctdb_ltdb_header *)data.dptr;
         if (!params->persistent) {
                 hdr->dmaster = params->ctdb->pnn;
+               hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
         }
  
         /* add the record to the blob ready to send to the nodes */
@@ -1223,7 +1233,118 @@ static void reload_nodes_file(struct ctdb_context *ctdb)
         ctdb_load_nodes_file(ctdb);
  }
  
-       
+static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
+                                        struct ctdb_recoverd *rec,
+                                        struct ctdb_node_map *nodemap,
+                                        uint32_t *culprit)
+{
+       int j;
+       int ret;
+       /* refresh the per-node public ip lists; returns 0, or -1 with *culprit (if non-NULL) set to the pnn blamed */
+       if (ctdb->num_nodes != nodemap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
+                                 ctdb->num_nodes, nodemap->num));
+               if (culprit) {
+                       *culprit = ctdb->pnn;
+               }
+               return -1;
+       }
+       /* ctdb->nodes[j] and nodemap->nodes[j] are indexed in parallel; the size check above guards this */
+       for (j=0; j<nodemap->num; j++) {
+               /* drop the lists cached from a previous call before deciding whether to refetch */
+               if (ctdb->nodes[j]->known_public_ips) {
+                       talloc_free(ctdb->nodes[j]->known_public_ips);
+                       ctdb->nodes[j]->known_public_ips = NULL;
+               }
+               if (ctdb->nodes[j]->available_public_ips) {
+                       talloc_free(ctdb->nodes[j]->available_public_ips);
+                       ctdb->nodes[j]->available_public_ips = NULL;
+               }
+               /* nodes flagged NODE_FLAGS_INACTIVE are not queried; their lists stay NULL */
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               /* first fetch: no filter flags (0), result stored in known_public_ips */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       CONTROL_TIMEOUT(),
+                                       ctdb->nodes[j]->pnn,
+                                       ctdb->nodes,
+                                       0,
+                                       &ctdb->nodes[j]->known_public_ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
+                               ctdb->nodes[j]->pnn));
+                       if (culprit) {
+                               *culprit = ctdb->nodes[j]->pnn;
+                       }
+                       return -1;
+               }
+               /* only with ip failover enabled and no ip_check_disable_ctx set: request a takeover run on inconsistency */
+               if (ctdb->tunable.disable_ip_failover == 0) {
+                       if (rec->ip_check_disable_ctx == NULL) {
+                               if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
+                                       DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
+                                       rec->need_takeover_run = true;
+                               }
+                       }
+               }
+
+               /* second fetch: CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, result stored in available_public_ips */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       CONTROL_TIMEOUT(),
+                                       ctdb->nodes[j]->pnn,
+                                       ctdb->nodes,
+                                       CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
+                                       &ctdb->nodes[j]->available_public_ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
+                               ctdb->nodes[j]->pnn));
+                       if (culprit) {
+                               *culprit = ctdb->nodes[j]->pnn;
+                       }
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/* when we start a recovery, make sure all nodes use the same reclock file
+   setting: sends the local recovery_lock_file via CTDB_CONTROL_SET_RECLOCK_FILE
+   to the nodes from list_of_active_nodes(); returns 0 on success, -1 on failure */
+static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       TDB_DATA data;
+       uint32_t *nodes;
+       /* no reclock file configured: send an empty blob; otherwise the path including its NUL terminator */
+       if (ctdb->recovery_lock_file == NULL) {
+               data.dptr  = NULL;
+               data.dsize = 0;
+       } else {
+               data.dsize = strlen(ctdb->recovery_lock_file) + 1;
+               data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
+       }
+
+       nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(),
+                                       false, data,
+                                       NULL, NULL,
+                                       rec) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+
  /*
    we are the recmaster, and recovery is needed - start a recovery run
   */
@@ -1238,6 +1359,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
         TDB_DATA data;
         uint32_t *nodes;
         struct timeval start_time;
+       uint32_t culprit = (uint32_t)-1;
  
         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
  
@@ -1266,12 +1388,14 @@ static int do_recovery(struct ctdb_recoverd *rec,
                 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
                 start_time = timeval_current();
                 if (!ctdb_recovery_lock(ctdb, true)) {
-                       ctdb_set_culprit(rec, pnn);
-                       DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
+                       DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
+                                        "and ban ourself for %u seconds\n",
+                                        ctdb->tunable.recovery_ban_period));
+                       ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
                         return -1;
                 }
                 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
-               DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n"));
+               DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
         }
  
         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
@@ -1309,6 +1433,11 @@ static int do_recovery(struct ctdb_recoverd *rec,
         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
  
  
+       /* update all other nodes to use the same setting for reclock files
+          as the local recovery master.
+       */
+       sync_recovery_lock_file_across_cluster(rec);
+
         /* set recovery mode to active on all nodes */
         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
         if (ret != 0) {
@@ -1323,6 +1452,23 @@ static int do_recovery(struct ctdb_recoverd *rec,
                 return -1;
         }
  
+       /*
+         update all nodes to have the same flags that we have
+        */
+       for (i=0;i<nodemap->num;i++) {
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+
+               ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+                       return -1;
+               }
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
         /* pick a new generation number */
         generation = new_generation();
  
@@ -1370,7 +1516,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
         for (i=0;i<dbmap->num;i++) {
                 ret = recover_database(rec, mem_ctx,
                                        dbmap->dbs[i].dbid,
-                                      dbmap->dbs[i].persistent,
+                                      dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
                                        pnn, nodemap, generation);
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
@@ -1480,13 +1626,19 @@ static int do_recovery(struct ctdb_recoverd *rec,
         /*
           tell nodes to takeover their public IPs
          */
+       ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                culprit));
+               rec->need_takeover_run = true;
+               return -1;
+       }
         rec->need_takeover_run = false;
         ret = ctdb_takeover_run(ctdb, nodemap);
         if (ret != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
-               return -1;
+               DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
+               rec->need_takeover_run = true;
         }
-       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
  
         /* execute the "recovered" event script on all nodes */
         ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
@@ -1499,7 +1651,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
  
         /* send a message to all clients telling them that the cluster 
            has been reconfigured */
-       ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
+       ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
  
         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
  
@@ -1530,9 +1682,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
            We now wait for rerecovery_timeout before we allow 
            another recovery to take place.
         */
-       DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
+       DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
         ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
-       DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
+       DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
  
         return 0;
  }
@@ -1660,7 +1812,7 @@ static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool u
  
         /* send an election message to all active nodes */
         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
-       ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
+       ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
  
  
         /* A new node that is already frozen has entered the cluster.
@@ -1758,7 +1910,7 @@ static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
  
  DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));           
  
-       ret = ctdb_send_message(ctdb, rd->pnn, rd->srvid, *dump);
+       ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
                 talloc_free(tmp_ctx);
@@ -1791,6 +1943,29 @@ static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
         rec->ip_check_disable_ctx = NULL;
  }
  
+/* message handler: on the recmaster, pass the received struct ctdb_public_ip to update_ip_assignment_tree() */
+static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+       struct ctdb_public_ip *ip;
+       /* nodes that are not the current recmaster drop the message */
+       if (rec->recmaster != rec->ctdb->pnn) {
+               DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
+               return;
+       }
+       /* payload must be exactly one struct ctdb_public_ip; sizeof yields size_t, so print with %zu, not the signed %zd */
+       if (data.dsize != sizeof(struct ctdb_public_ip)) {
+               DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
+               return;
+       }
+       /* size was validated above, so the blob can be viewed as the struct in place */
+       ip = (struct ctdb_public_ip *)data.dptr;
+
+       update_ip_assignment_tree(rec->ctdb, ip);
+}
+
+
  static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid, 
                              TDB_DATA data, void *private_data)
  {
@@ -1860,9 +2035,27 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb
         TDB_DATA result;
         int32_t ret;
         struct ip_reallocate_list *callers;
+       uint32_t culprit;
  
         DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
-       ret = ctdb_takeover_run(ctdb, rec->nodemap);
+
+       /* update the list of public ips that a node can handle for
+          all connected nodes
+       */
+       ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                culprit));
+               rec->need_takeover_run = true;
+       }
+       if (ret == 0) {
+               ret = ctdb_takeover_run(ctdb, rec->nodemap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
+                       rec->need_takeover_run = true;
+               }
+       }
+
         result.dsize = sizeof(int32_t);
         result.dptr  = (uint8_t *)&ret;
  
@@ -1875,7 +2068,7 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb
                 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
                                   "%u:%llu\n", (unsigned)callers->rd->pnn,
                                   (unsigned long long)callers->rd->srvid));
-               ret = ctdb_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
+               ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
                                          "message to %u:%llu\n",
@@ -1906,6 +2099,8 @@ static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
         /* we got an election packet - update the timeout for the election */
         talloc_free(rec->election_timeout);
         rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                 ctdb_election_timeout, rec);
  
@@ -1973,6 +2168,8 @@ static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
  
         talloc_free(rec->election_timeout);
         rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                 ctdb_election_timeout, rec);
  
@@ -2301,15 +2498,18 @@ static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ct
  }
  
  
-/* called to check that the allocation of public ip addresses is ok.
+/* called to check that the local allocation of public ip addresses is ok.
  */
-static int verify_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
+static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
  {
         TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       struct ctdb_control_get_ifaces *ifaces = NULL;
         struct ctdb_all_public_ips *ips = NULL;
         struct ctdb_uptime *uptime1 = NULL;
         struct ctdb_uptime *uptime2 = NULL;
         int ret, j;
+       bool need_iface_check = false;
+       bool need_takeover_run = false;
  
         ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
                                 CTDB_CURRENT_NODE, &uptime1);
@@ -2319,6 +2519,30 @@ static int verify_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd
                 return -1;
         }
  
+
+       /* read the interfaces from the local node */
+       ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
+               talloc_free(mem_ctx);
+               return -1;
+       }
+
+       if (!rec->ifaces) {
+               need_iface_check = true;
+       } else if (rec->ifaces->num != ifaces->num) {
+               need_iface_check = true;
+       } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
+               need_iface_check = true;
+       }
+
+       if (need_iface_check) {
+               DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
+                                    "local node %u - force takeover run\n",
+                                    pnn));
+               need_takeover_run = true;
+       }
+
         /* read the ip allocation from the local node */
         ret = ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
         if (ret != 0) {
@@ -2354,58 +2578,61 @@ static int verify_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd
         /* skip the check if we have started but not finished recovery */
         if (timeval_compare(&uptime1->last_recovery_finished,
                             &uptime1->last_recovery_started) != 1) {
-               DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
+               DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
                 talloc_free(mem_ctx);
  
                 return 0;
         }
  
+       talloc_free(rec->ifaces);
+       rec->ifaces = talloc_steal(rec, ifaces);
+
         /* verify that we have the ip addresses we should have
            and we dont have ones we shouldnt have.
            if we find an inconsistency we set recmode to
            active on the local node and wait for the recmaster
-          to do a full blown recovery
+          to do a full blown recovery.
+          also if the pnn is -1 and we are healthy and can host the ip
+          we also request a ip reallocation.
         */
-       for (j=0; j<ips->num; j++) {
-               if (ips->ips[j].pnn == pnn) {
-                       if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
-                               struct takeover_run_reply rd;
-                               TDB_DATA data;
-
-                               DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
-                                       ctdb_addr_to_str(&ips->ips[j].addr)));
-
-                               rd.pnn   = ctdb->pnn;
-                               rd.srvid = 0;
-                               data.dptr = (uint8_t *)&rd;
-                               data.dsize = sizeof(rd);
-
-                               ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
-                               if (ret != 0) {
-                                       DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
+       if (ctdb->tunable.disable_ip_failover == 0) {
+               for (j=0; j<ips->num; j++) {
+                       if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
+                               DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                               need_takeover_run = true;
+                       } else if (ips->ips[j].pnn == pnn) {
+                               if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
+                                       DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                                       need_takeover_run = true;
                                 }
-                       }
-               } else {
-                       if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
-                               struct takeover_run_reply rd;
-                               TDB_DATA data;
-
-                               DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", 
-                                       ctdb_addr_to_str(&ips->ips[j].addr)));
-
-                               rd.pnn   = ctdb->pnn;
-                               rd.srvid = 0;
-                               data.dptr = (uint8_t *)&rd;
-                               data.dsize = sizeof(rd);
-
-                               ret = ctdb_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
-                               if (ret != 0) {
-                                       DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
+                       } else {
+                               if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
+                                       DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", 
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                                       need_takeover_run = true;
                                 }
                         }
                 }
         }
  
+       if (need_takeover_run) {
+               struct takeover_run_reply rd;
+               TDB_DATA data;
+
+               DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
+
+               rd.pnn = ctdb->pnn;
+               rd.srvid = 0;
+               data.dptr = (uint8_t *)&rd;
+               data.dsize = sizeof(rd);
+
+               ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
+               }
+       }
         talloc_free(mem_ctx);
         return 0;
  }
@@ -2546,7 +2773,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                 return -1;
         }
  
-       state->child = fork();
+       state->child = ctdb_fork(ctdb);
         if (state->child == (pid_t)-1) {
                 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
                 close(state->fd[0]);
@@ -2562,6 +2789,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                 close(state->fd[0]);
                 state->fd[0] = -1;
  
+               debug_extra = talloc_asprintf(NULL, "recovery-lock:");
                 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
                         DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
                         cc = RECLOCK_FAILED;
@@ -2592,7 +2820,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
         }
  
         state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
-                               EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+                               EVENT_FD_READ,
                                 reclock_child_handler,
                                 (void *)state);
  
@@ -2601,6 +2829,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                 talloc_free(state);
                 return -1;
         }
+       tevent_fd_set_auto_close(state->fde);
  
         while (state->status == RECLOCK_CHECKING) {
                 event_loop_once(ctdb->ev);
@@ -2671,14 +2900,11 @@ static int update_recovery_lock_file(struct ctdb_context *ctdb)
         talloc_free(tmp_ctx);
         return 0;
  }
-               
-/*
-  the main monitoring loop
- */
-static void monitor_cluster(struct ctdb_context *ctdb)
+
+static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
+                     TALLOC_CTX *mem_ctx)
  {
         uint32_t pnn;
-       TALLOC_CTX *mem_ctx=NULL;
         struct ctdb_node_map *nodemap=NULL;
         struct ctdb_node_map *recmaster_nodemap=NULL;
         struct ctdb_node_map **remote_nodemaps=NULL;
@@ -2686,54 +2912,8 @@ static void monitor_cluster(struct ctdb_context *ctdb)
         struct ctdb_vnn_map *remote_vnnmap=NULL;
         int32_t debug_level;
         int i, j, ret;
-       struct ctdb_recoverd *rec;
  
-       DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
  
-       rec = talloc_zero(ctdb, struct ctdb_recoverd);
-       CTDB_NO_MEMORY_FATAL(ctdb, rec);
-
-       rec->ctdb = ctdb;
-
-       rec->priority_time = timeval_current();
-
-       /* register a message port for sending memory dumps */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
-
-       /* register a message port for recovery elections */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
-
-       /* when nodes are disabled/enabled */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
-
-       /* when we are asked to puch out a flag change */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
-
-       /* register a message port for vacuum fetch */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
-
-       /* register a message port for reloadnodes  */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
-
-       /* register a message port for performing a takeover run */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
-
-       /* register a message port for disabling the ip check for a short while */
-       ctdb_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
-
-again:
-       if (mem_ctx) {
-               talloc_free(mem_ctx);
-               mem_ctx = NULL;
-       }
-       mem_ctx = talloc_new(ctdb);
-       if (!mem_ctx) {
-               DEBUG(DEBUG_CRIT,(__location__ " Failed to create temporary context\n"));
-               exit(-1);
-       }
-
-       /* we only check for recovery once every second */
-       ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);
  
         /* verify that the main daemon is still running */
         if (kill(ctdb->ctdbd_pid, 0) != 0) {
@@ -2746,14 +2926,14 @@ again:
  
         if (rec->election_timeout) {
                 /* an election is in progress */
-               goto again;
+               return;
         }
  
         /* read the debug level from the parent and update locally */
         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
         if (ret !=0) {
                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
-               goto again;
+               return;
         }
         LogLevel = debug_level;
  
@@ -2783,13 +2963,13 @@ again:
         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
-               goto again;
+               return;
         }
  
         /* get the current recovery lock file from the server */
         if (update_recovery_lock_file(ctdb) != 0) {
                 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
-               goto again;
+               return;
         }
  
         /* Make sure that if recovery lock verification becomes disabled when
@@ -2805,14 +2985,14 @@ again:
         pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
         if (pnn == (uint32_t)-1) {
                 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
-               goto again;
+               return;
         }
  
         /* get the vnnmap */
         ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
-               goto again;
+               return;
         }
  
  
@@ -2825,7 +3005,7 @@ again:
         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
-               goto again;
+               return;
         }
         nodemap = rec->nodemap;
  
@@ -2833,7 +3013,7 @@ again:
         ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
-               goto again;
+               return;
         }
  
         /* if we are not the recmaster we can safely ignore any ip reallocate requests */
@@ -2844,15 +3024,11 @@ again:
                         rec->reallocate_callers = NULL;
                 }
         }
-       /* if there are takeovers requested, perform it and notify the waiters */
-       if (rec->reallocate_callers) {
-               process_ipreallocate_requests(ctdb, rec);
-       }
  
         if (rec->recmaster == (uint32_t)-1) {
                 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
  
@@ -2870,15 +3046,15 @@ again:
                         ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
                         if (ret != 0) {
                                 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node due to node being STOPPED\n"));
-                               goto again;
+                               return;
                         }
                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
                         if (ret != 0) {
                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode due to node being stopped\n"));
  
-                               goto again;
+                               return;
                         }
-                       goto again;
+                       return;
                 }
         }
         /* If the local node is stopped, verify we are not the recmaster 
@@ -2887,7 +3063,7 @@ again:
         if ((nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) && (rec->recmaster == pnn)) {
                 DEBUG(DEBUG_ERR,("Local node is STOPPED. Yielding recmaster role\n"));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
         
         /* check that we (recovery daemon) and the local ctdb daemon
@@ -2921,14 +3097,14 @@ again:
         if (j == nodemap->num) {
                 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
         /* if recovery master is disconnected we must elect a new recmaster */
         if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
         /* grap the nodemap from the recovery master to check if it is banned */
@@ -2937,23 +3113,23 @@ again:
         if (ret != 0) {
                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n", 
                           nodemap->nodes[j].pnn));
-               goto again;
+               return;
         }
  
  
         if (recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
  
  
         /* verify that we have all ip addresses we should have and we dont
          * have addresses we shouldnt have.
          */ 
-       if (ctdb->do_checkpublicip) {
+       if (ctdb->tunable.disable_ip_failover == 0) {
                 if (rec->ip_check_disable_ctx == NULL) {
-                       if (verify_ip_allocation(ctdb, rec, pnn) != 0) {
+                       if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
                                 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
                         }
                 }
@@ -2964,7 +3140,7 @@ again:
            if recovery is needed
          */
         if (pnn != rec->recmaster) {
-               goto again;
+               return;
         }
  
  
@@ -2973,63 +3149,38 @@ again:
         if (ret == MONITOR_ELECTION_NEEDED) {
                 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         }
         if (ret != MONITOR_OK) {
                 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
-               goto again;
+               return;
         }
  
-       /* update the list of public ips that a node can handle for
-          all connected nodes
-       */
         if (ctdb->num_nodes != nodemap->num) {
                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
                 reload_nodes_file(ctdb);
-               goto again;
-       }
-       for (j=0; j<nodemap->num; j++) {
-               /* release any existing data */
-               if (ctdb->nodes[j]->public_ips) {
-                       talloc_free(ctdb->nodes[j]->public_ips);
-                       ctdb->nodes[j]->public_ips = NULL;
-               }
-
-               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
-                       continue;
-               }
-
-               /* grab a new shiny list of public ips from the node */
-               if (ctdb_ctrl_get_public_ips(ctdb, CONTROL_TIMEOUT(),
-                       ctdb->nodes[j]->pnn, 
-                       ctdb->nodes,
-                       &ctdb->nodes[j]->public_ips)) {
-                       DEBUG(DEBUG_ERR,("Failed to read public ips from node : %u\n", 
-                               ctdb->nodes[j]->pnn));
-                       goto again;
-               }
+               return;
         }
  
-
         /* verify that all active nodes agree that we are the recmaster */
         switch (verify_recmaster(rec, nodemap, pnn)) {
         case MONITOR_RECOVERY_NEEDED:
                 /* can not happen */
-               goto again;
+               return;
         case MONITOR_ELECTION_NEEDED:
                 force_election(rec, pnn, nodemap);
-               goto again;
+               return;
         case MONITOR_OK:
                 break;
         case MONITOR_FAILED:
-               goto again;
+               return;
         }
  
  
         if (rec->need_recovery) {
                 /* a previous recovery didn't finish */
                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-               goto again;             
+               return;
         }
  
         /* verify that all active nodes are in normal mode 
@@ -3038,9 +3189,9 @@ again:
         switch (verify_recmode(ctdb, nodemap)) {
         case MONITOR_RECOVERY_NEEDED:
                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-               goto again;
+               return;
         case MONITOR_FAILED:
-               goto again;
+               return;
         case MONITOR_ELECTION_NEEDED:
                 /* can not happen */
         case MONITOR_OK:
@@ -3055,23 +3206,28 @@ again:
                         DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
                         ctdb_set_culprit(rec, ctdb->pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
         }
  
+       /* if there are takeovers requested, perform it and notify the waiters */
+       if (rec->reallocate_callers) {
+               process_ipreallocate_requests(ctdb, rec);
+       }
+
         /* get the nodemap for all active remote nodes
          */
         remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
         if (remote_nodemaps == NULL) {
                 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
-               goto again;
+               return;
         }
         for(i=0; i<nodemap->num; i++) {
                 remote_nodemaps[i] = NULL;
         }
         if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
                 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
-               goto again;
+               return;
         } 
  
         /* verify that all other nodes have the same nodemap as we have
@@ -3085,7 +3241,7 @@ again:
                         DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
                         ctdb_set_culprit(rec, j);
  
-                       goto again;
+                       return;
                 }
  
                 /* if the nodes disagree on how many nodes there are
@@ -3096,7 +3252,7 @@ again:
                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
  
                 /* if the nodes disagree on which nodes exist and are
@@ -3110,7 +3266,7 @@ again:
                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                 do_recovery(rec, mem_ctx, pnn, nodemap, 
                                             vnnmap);
-                               goto again;
+                               return;
                         }
                 }
  
@@ -3133,14 +3289,14 @@ again:
                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                         do_recovery(rec, mem_ctx, pnn, nodemap, 
                                                     vnnmap);
-                                       goto again;
+                                       return;
                                 } else {
                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                         do_recovery(rec, mem_ctx, pnn, nodemap, 
                                                     vnnmap);
-                                       goto again;
+                                       return;
                                 }
                         }
                 }
@@ -3155,7 +3311,7 @@ again:
                           vnnmap->size, rec->num_active));
                 ctdb_set_culprit(rec, ctdb->pnn);
                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-               goto again;
+               return;
         }
  
         /* verify that all active nodes in the nodemap also exist in 
@@ -3179,7 +3335,7 @@ again:
                                   nodemap->nodes[j].pnn));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
         }
  
@@ -3200,7 +3356,7 @@ again:
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n", 
                                   nodemap->nodes[j].pnn));
-                       goto again;
+                       return;
                 }
  
                 /* verify the vnnmap generation is the same */
@@ -3209,7 +3365,7 @@ again:
                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
  
                 /* verify the vnnmap size is the same */
@@ -3218,7 +3374,7 @@ again:
                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
-                       goto again;
+                       return;
                 }
  
                 /* verify the vnnmap is the same */
@@ -3229,28 +3385,41 @@ again:
                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
                                 do_recovery(rec, mem_ctx, pnn, nodemap, 
                                             vnnmap);
-                               goto again;
+                               return;
                         }
                 }
         }
  
         /* we might need to change who has what IP assigned */
         if (rec->need_takeover_run) {
+               uint32_t culprit = (uint32_t)-1;
+
                 rec->need_takeover_run = false;
  
+               /* update the list of public ips that a node can handle for
+                  all connected nodes
+               */
+               ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                        culprit));
+                       rec->need_takeover_run = true;
+                       return;
+               }
+
                 /* execute the "startrecovery" event script on all nodes */
                 ret = run_startrecovery_eventscript(rec, nodemap);
                 if (ret!=0) {
                         DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
                         ctdb_set_culprit(rec, ctdb->pnn);
                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
                 }
  
                 ret = ctdb_takeover_run(ctdb, nodemap);
                 if (ret != 0) {
-                       DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
-                       ctdb_set_culprit(rec, ctdb->pnn);
-                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
+                       return;
                 }
  
                 /* execute the "recovered" event script on all nodes */
@@ -3267,10 +3436,73 @@ again:
                 }
  #endif
         }
+}
+
+/*
+  the main monitoring loop
+ */
+static void monitor_cluster(struct ctdb_context *ctdb)
+{
+       struct ctdb_recoverd *rec;
+
+       DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
+
+       rec = talloc_zero(ctdb, struct ctdb_recoverd);
+       CTDB_NO_MEMORY_FATAL(ctdb, rec);
+
+       rec->ctdb = ctdb;
+
+       rec->priority_time = timeval_current();
+
+       /* register a message port for sending memory dumps */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
+
+       /* register a message port for recovery elections */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
+
+       /* when nodes are disabled/enabled */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
+
+       /* when we are asked to push out a flag change */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
+
+       /* register a message port for vacuum fetch */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
  
+       /* register a message port for reloadnodes  */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
+
+       /* register a message port for performing a takeover run */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
+
+       /* register a message port for disabling the ip check for a short while */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
+
+       /* register a message port for updating the recovery daemon's record of which node an ip is assigned to */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
+
+       for (;;) {
+               TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+               struct timeval start;
+               double elapsed;
+
+               if (!mem_ctx) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         " Failed to create temp context\n"));
+                       exit(-1);
+               }
  
-       goto again;
+               start = timeval_current();
+               main_loop(ctdb, rec, mem_ctx);
+               talloc_free(mem_ctx);
  
+               /* we only check for recovery once per recover_interval (seconds) */
+               elapsed = timeval_elapsed(&start);
+               if (elapsed < ctdb->tunable.recover_interval) {
+                       ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
+                                         - elapsed);
+               }
+       }
  }
  
  /*
@@ -3292,18 +3524,12 @@ static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
         struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
  
         if (kill(ctdb->recoverd_pid, 0) != 0) {
-               DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
+               DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
  
-               ctdb_stop_recoverd(ctdb);
-               ctdb_stop_keepalive(ctdb);
-               ctdb_stop_monitoring(ctdb);
-               ctdb_release_all_ips(ctdb);
-               if (ctdb->methods != NULL) {
-                       ctdb->methods->shutdown(ctdb);
-               }
-               ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+               event_add_timed(ctdb->ev, ctdb, timeval_zero(), 
+                               ctdb_restart_recd, ctdb);
  
-               exit(10);       
+               return;
         }
  
         event_add_timed(ctdb->ev, ctdb, 
@@ -3341,6 +3567,7 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
  {
         int fd[2];
         struct signal_event *se;
+       struct tevent_fd *fde;
  
         if (pipe(fd) != 0) {
                 return -1;
@@ -3365,15 +3592,16 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
  
         srandom(getpid() ^ time(NULL));
  
-       if (switch_from_server_to_client(ctdb) != 0) {
+       if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
                 exit(1);
         }
  
-       DEBUG(DEBUG_NOTICE, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
  
-       event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE, 
+       fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
                      ctdb_recoverd_parent, &fd[0]);     
+       tevent_fd_set_auto_close(fde);
  
         /* set up a handler to pick up sigchld */
         se = event_add_signal(ctdb->ev, ctdb,
@@ -3403,3 +3631,13 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb)
         DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
         kill(ctdb->recoverd_pid, SIGTERM);
  }
+
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, 
+                      struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+       DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
+       ctdb_stop_recoverd(ctdb);
+       ctdb_start_recoverd(ctdb);
+}