Merge branch 'master-readonly-records' into foo
[sahlberg/ctdb.git] / server / ctdb_recoverd.c
index 46321dd7fe66cee7ea092a13ed5d1d33f70c1b60..631f53e89bd4d04908b4b3abc8b1da761d86163a 100644 (file)
@@ -18,7 +18,7 @@
 */
 
 #include "includes.h"
 */
 
 #include "includes.h"
-#include "lib/events/events.h"
+#include "lib/tevent/tevent.h"
 #include "system/filesys.h"
 #include "system/time.h"
 #include "system/network.h"
 #include "system/filesys.h"
 #include "system/time.h"
 #include "system/network.h"
@@ -70,6 +70,7 @@ struct ctdb_recoverd {
 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 
 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
 
 /*
   ban a node for a period of time
 
 /*
   ban a node for a period of time
@@ -438,7 +439,8 @@ static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctd
                                return -1;
                        }
                        ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
                                return -1;
                        }
                        ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
-                                          mem_ctx, name, dbmap->dbs[db].persistent);
+                                          mem_ctx, name,
+                                          dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
                                return -1;
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
                                return -1;
@@ -501,7 +503,7 @@ static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb
                                return -1;
                        }
                        ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name, 
                                return -1;
                        }
                        ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name, 
-                                          remote_dbmap->dbs[db].persistent);
+                                          remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
                                return -1;
                        if (ret != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
                                return -1;
@@ -713,6 +715,7 @@ static void vacuum_fetch_next(struct vacuum_info *v)
                ZERO_STRUCT(call);
                call.call_id = CTDB_NULL_FUNC;
                call.flags = CTDB_IMMEDIATE_MIGRATION;
                ZERO_STRUCT(call);
                call.call_id = CTDB_NULL_FUNC;
                call.flags = CTDB_IMMEDIATE_MIGRATION;
+               call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
 
                r = v->r;
                v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
 
                r = v->r;
                v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
@@ -821,7 +824,7 @@ static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
 
        for (i=0;i<dbmap->num;i++) {
                if (dbmap->dbs[i].dbid == recs->db_id) {
 
        for (i=0;i<dbmap->num;i++) {
                if (dbmap->dbs[i].dbid == recs->db_id) {
-                       persistent = dbmap->dbs[i].persistent;
+                       persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
                        break;
                }
        }
                        break;
                }
        }
@@ -839,7 +842,7 @@ static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
        }
 
        /* attach to it */
        }
 
        /* attach to it */
-       ctdb_db = ctdb_attach(ctdb, name, persistent, 0);
+       ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
        if (ctdb_db == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
                talloc_free(tmp_ctx);
        if (ctdb_db == NULL) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
                talloc_free(tmp_ctx);
@@ -905,6 +908,7 @@ static void ctdb_election_timeout(struct event_context *ev, struct timed_event *
 {
        struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
        rec->election_timeout = NULL;
 {
        struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
        rec->election_timeout = NULL;
+       fast_start = false;
 
        DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
 }
 
        DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
 }
@@ -1064,6 +1068,7 @@ static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
        hdr = (struct ctdb_ltdb_header *)data.dptr;
        if (!params->persistent) {
                hdr->dmaster = params->ctdb->pnn;
        hdr = (struct ctdb_ltdb_header *)data.dptr;
        if (!params->persistent) {
                hdr->dmaster = params->ctdb->pnn;
+               hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
        }
 
        /* add the record to the blob ready to send to the nodes */
        }
 
        /* add the record to the blob ready to send to the nodes */
@@ -1276,10 +1281,12 @@ static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
                        return -1;
                }
 
                        return -1;
                }
 
-               if (rec->ip_check_disable_ctx == NULL) {
-                       if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
-                               DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
-                               rec->need_takeover_run = true;
+               if (ctdb->tunable.disable_ip_failover == 0) {
+                       if (rec->ip_check_disable_ctx == NULL) {
+                               if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
+                                       DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
+                                       rec->need_takeover_run = true;
+                               }
                        }
                }
 
                        }
                }
 
@@ -1381,8 +1388,10 @@ static int do_recovery(struct ctdb_recoverd *rec,
                DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
                start_time = timeval_current();
                if (!ctdb_recovery_lock(ctdb, true)) {
                DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
                start_time = timeval_current();
                if (!ctdb_recovery_lock(ctdb, true)) {
-                       ctdb_set_culprit(rec, pnn);
-                       DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n"));
+                       DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
+                                        "and ban ourself for %u seconds\n",
+                                        ctdb->tunable.recovery_ban_period));
+                       ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
                        return -1;
                }
                ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
                        return -1;
                }
                ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
@@ -1507,7 +1516,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
        for (i=0;i<dbmap->num;i++) {
                ret = recover_database(rec, mem_ctx,
                                       dbmap->dbs[i].dbid,
        for (i=0;i<dbmap->num;i++) {
                ret = recover_database(rec, mem_ctx,
                                       dbmap->dbs[i].dbid,
-                                      dbmap->dbs[i].persistent,
+                                      dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
                                       pnn, nodemap, generation);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
                                       pnn, nodemap, generation);
                if (ret != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
@@ -1621,15 +1630,15 @@ static int do_recovery(struct ctdb_recoverd *rec,
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
                                 culprit));
        if (ret != 0) {
                DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
                                 culprit));
+               rec->need_takeover_run = true;
                return -1;
        }
        rec->need_takeover_run = false;
        ret = ctdb_takeover_run(ctdb, nodemap);
        if (ret != 0) {
                return -1;
        }
        rec->need_takeover_run = false;
        ret = ctdb_takeover_run(ctdb, nodemap);
        if (ret != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses\n"));
-               return -1;
+               DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
+               rec->need_takeover_run = true;
        }
        }
-       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
 
        /* execute the "recovered" event script on all nodes */
        ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
 
        /* execute the "recovered" event script on all nodes */
        ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
@@ -1673,9 +1682,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
           We now wait for rerecovery_timeout before we allow 
           another recovery to take place.
        */
           We now wait for rerecovery_timeout before we allow 
           another recovery to take place.
        */
-       DEBUG(DEBUG_NOTICE, (__location__ " New recoveries supressed for the rerecovery timeout\n"));
+       DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
        ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
        ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
-       DEBUG(DEBUG_NOTICE, (__location__ " Rerecovery timeout elapsed. Recovery reactivated.\n"));
+       DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
 
        return 0;
 }
 
        return 0;
 }
@@ -2042,8 +2051,7 @@ static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb
        if (ret == 0) {
                ret = ctdb_takeover_run(ctdb, rec->nodemap);
                if (ret != 0) {
        if (ret == 0) {
                ret = ctdb_takeover_run(ctdb, rec->nodemap);
                if (ret != 0) {
-                       DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
-                                        culprit));
+                       DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
                        rec->need_takeover_run = true;
                }
        }
                        rec->need_takeover_run = true;
                }
        }
@@ -2091,6 +2099,8 @@ static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
        /* we got an election packet - update the timeout for the election */
        talloc_free(rec->election_timeout);
        rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
        /* we got an election packet - update the timeout for the election */
        talloc_free(rec->election_timeout);
        rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
                                                timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                ctdb_election_timeout, rec);
 
                                                timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                ctdb_election_timeout, rec);
 
@@ -2158,6 +2168,8 @@ static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
 
        talloc_free(rec->election_timeout);
        rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
 
        talloc_free(rec->election_timeout);
        rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
                                                timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                ctdb_election_timeout, rec);
 
                                                timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
                                                ctdb_election_timeout, rec);
 
@@ -2488,7 +2500,7 @@ static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ct
 
 /* called to check that the local allocation of public ip addresses is ok.
 */
 
 /* called to check that the local allocation of public ip addresses is ok.
 */
-static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn)
+static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
 {
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        struct ctdb_control_get_ifaces *ifaces = NULL;
 {
        TALLOC_CTX *mem_ctx = talloc_new(NULL);
        struct ctdb_control_get_ifaces *ifaces = NULL;
@@ -2566,7 +2578,7 @@ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_rec
        /* skip the check if we have started but not finished recovery */
        if (timeval_compare(&uptime1->last_recovery_finished,
                            &uptime1->last_recovery_started) != 1) {
        /* skip the check if we have started but not finished recovery */
        if (timeval_compare(&uptime1->last_recovery_finished,
                            &uptime1->last_recovery_started) != 1) {
-               DEBUG(DEBUG_NOTICE, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
+               DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
                talloc_free(mem_ctx);
 
                return 0;
                talloc_free(mem_ctx);
 
                return 0;
@@ -2579,20 +2591,28 @@ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_rec
           and we dont have ones we shouldnt have.
           if we find an inconsistency we set recmode to
           active on the local node and wait for the recmaster
           and we dont have ones we shouldnt have.
           if we find an inconsistency we set recmode to
           active on the local node and wait for the recmaster
-          to do a full blown recovery
+          to do a full blown recovery.
+          also if the pnn is -1 and we are healthy and can host the ip
+          we request an ip reallocation.
        */
        */
-       for (j=0; j<ips->num; j++) {
-               if (ips->ips[j].pnn == pnn) {
-                       if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
-                               DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
-                                       ctdb_addr_to_str(&ips->ips[j].addr)));
-                               need_takeover_run = true;
-                       }
-               } else {
-                       if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
-                               DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", 
-                                       ctdb_addr_to_str(&ips->ips[j].addr)));
+       if (ctdb->tunable.disable_ip_failover == 0) {
+               for (j=0; j<ips->num; j++) {
+                       if (ips->ips[j].pnn == -1 && nodemap->nodes[pnn].flags == 0) {
+                               DEBUG(DEBUG_CRIT,("Public address '%s' is not assigned and we could serve this ip\n",
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
                                need_takeover_run = true;
                                need_takeover_run = true;
+                       } else if (ips->ips[j].pnn == pnn) {
+                               if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
+                                       DEBUG(DEBUG_CRIT,("Public address '%s' is missing and we should serve this ip\n",
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                                       need_takeover_run = true;
+                               }
+                       } else {
+                               if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
+                                       DEBUG(DEBUG_CRIT,("We are still serving a public address '%s' that we should not be serving.\n", 
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                                       need_takeover_run = true;
+                               }
                        }
                }
        }
                        }
                }
        }
@@ -2753,7 +2773,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                return -1;
        }
 
                return -1;
        }
 
-       state->child = fork();
+       state->child = ctdb_fork(ctdb);
        if (state->child == (pid_t)-1) {
                DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
                close(state->fd[0]);
        if (state->child == (pid_t)-1) {
                DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
                close(state->fd[0]);
@@ -2769,6 +2789,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                close(state->fd[0]);
                state->fd[0] = -1;
 
                close(state->fd[0]);
                state->fd[0] = -1;
 
+               debug_extra = talloc_asprintf(NULL, "recovery-lock:");
                if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
                        DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
                        cc = RECLOCK_FAILED;
                if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
                        DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
                        cc = RECLOCK_FAILED;
@@ -2799,7 +2820,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
        }
 
        state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
        }
 
        state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
-                               EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
+                               EVENT_FD_READ,
                                reclock_child_handler,
                                (void *)state);
 
                                reclock_child_handler,
                                (void *)state);
 
@@ -2808,6 +2829,7 @@ static int check_recovery_lock(struct ctdb_context *ctdb)
                talloc_free(state);
                return -1;
        }
                talloc_free(state);
                return -1;
        }
+       tevent_fd_set_auto_close(state->fde);
 
        while (state->status == RECLOCK_CHECKING) {
                event_loop_once(ctdb->ev);
 
        while (state->status == RECLOCK_CHECKING) {
                event_loop_once(ctdb->ev);
@@ -3002,10 +3024,6 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                        rec->reallocate_callers = NULL;
                }
        }
                        rec->reallocate_callers = NULL;
                }
        }
-       /* if there are takeovers requested, perform it and notify the waiters */
-       if (rec->reallocate_callers) {
-               process_ipreallocate_requests(ctdb, rec);
-       }
 
        if (rec->recmaster == (uint32_t)-1) {
                DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
 
        if (rec->recmaster == (uint32_t)-1) {
                DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
@@ -3109,9 +3127,9 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
        /* verify that we have all ip addresses we should have and we dont
         * have addresses we shouldnt have.
         */ 
        /* verify that we have all ip addresses we should have and we dont
         * have addresses we shouldnt have.
         */ 
-       if (ctdb->do_checkpublicip) {
+       if (ctdb->tunable.disable_ip_failover == 0) {
                if (rec->ip_check_disable_ctx == NULL) {
                if (rec->ip_check_disable_ctx == NULL) {
-                       if (verify_local_ip_allocation(ctdb, rec, pnn) != 0) {
+                       if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
                                DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
                        }
                }
                                DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
                        }
                }
@@ -3192,6 +3210,11 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                }
        }
 
                }
        }
 
+       /* if there are takeovers requested, perform it and notify the waiters */
+       if (rec->reallocate_callers) {
+               process_ipreallocate_requests(ctdb, rec);
+       }
+
        /* get the nodemap for all active remote nodes
         */
        remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
        /* get the nodemap for all active remote nodes
         */
        remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
@@ -3380,8 +3403,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
                if (ret != 0) {
                        DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
                                         culprit));
                if (ret != 0) {
                        DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
                                         culprit));
-                       ctdb_set_culprit(rec, culprit);
-                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       rec->need_takeover_run = true;
                        return;
                }
 
                        return;
                }
 
@@ -3396,9 +3418,7 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
 
                ret = ctdb_takeover_run(ctdb, nodemap);
                if (ret != 0) {
 
                ret = ctdb_takeover_run(ctdb, nodemap);
                if (ret != 0) {
-                       DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
-                       ctdb_set_culprit(rec, ctdb->pnn);
-                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Try again later\n"));
                        return;
                }
 
                        return;
                }
 
@@ -3504,18 +3524,12 @@ static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
        struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
 
        if (kill(ctdb->recoverd_pid, 0) != 0) {
        struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
 
        if (kill(ctdb->recoverd_pid, 0) != 0) {
-               DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Shutting down main daemon\n", (int)ctdb->recoverd_pid));
+               DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
 
 
-               ctdb_stop_recoverd(ctdb);
-               ctdb_stop_keepalive(ctdb);
-               ctdb_stop_monitoring(ctdb);
-               ctdb_release_all_ips(ctdb);
-               if (ctdb->methods != NULL) {
-                       ctdb->methods->shutdown(ctdb);
-               }
-               ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+               event_add_timed(ctdb->ev, ctdb, timeval_zero(), 
+                               ctdb_restart_recd, ctdb);
 
 
-               exit(10);       
+               return;
        }
 
        event_add_timed(ctdb->ev, ctdb, 
        }
 
        event_add_timed(ctdb->ev, ctdb, 
@@ -3553,6 +3567,7 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
 {
        int fd[2];
        struct signal_event *se;
 {
        int fd[2];
        struct signal_event *se;
+       struct tevent_fd *fde;
 
        if (pipe(fd) != 0) {
                return -1;
 
        if (pipe(fd) != 0) {
                return -1;
@@ -3577,15 +3592,16 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
 
        srandom(getpid() ^ time(NULL));
 
 
        srandom(getpid() ^ time(NULL));
 
-       if (switch_from_server_to_client(ctdb) != 0) {
+       if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
                DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
                exit(1);
        }
 
        DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
 
                DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
                exit(1);
        }
 
        DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
 
-       event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE, 
+       fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
                     ctdb_recoverd_parent, &fd[0]);     
                     ctdb_recoverd_parent, &fd[0]);     
+       tevent_fd_set_auto_close(fde);
 
        /* set up a handler to pick up sigchld */
        se = event_add_signal(ctdb->ev, ctdb,
 
        /* set up a handler to pick up sigchld */
        se = event_add_signal(ctdb->ev, ctdb,
@@ -3615,3 +3631,13 @@ void ctdb_stop_recoverd(struct ctdb_context *ctdb)
        DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
        kill(ctdb->recoverd_pid, SIGTERM);
 }
        DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
        kill(ctdb->recoverd_pid, SIGTERM);
 }
+
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, 
+                      struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       /* timed-event callback: ctdb_check_recd() queues this with a zero timeout when kill(recoverd_pid, 0) fails; ev, te and t are unused */
+       DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
+       ctdb_stop_recoverd(ctdb);       /* sends SIGTERM to ctdb->recoverd_pid */
+       ctdb_start_recoverd(ctdb);      /* returns int; the result is not checked here */
+}