- merge from ronnie
[ambi/samba-autobuild/.git] / ctdb / server / ctdb_recoverd.c
index c39cfda68d1a6b4ecf9d18b61c10a11bbd303b08..b0f36679d97b53f0f780f017ce0e7aeafa36bd08 100644 (file)
@@ -43,6 +43,7 @@ struct ctdb_recoverd {
        struct ban_state **banned_nodes;
        struct timeval priority_time;
        bool need_takeover_run;
+       bool need_recovery;
 };
 
 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
@@ -731,6 +732,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
        uint32_t generation;
        struct ctdb_dbid_map *dbmap;
 
+       /* if recovery fails, force it again */
+       rec->need_recovery = true;
+
        if (rec->last_culprit != culprit ||
            timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
                /* either a new node is the culprit, or we've decide to forgive them */
@@ -909,7 +913,6 @@ static int do_recovery(struct ctdb_recoverd *rec,
                ret = ctdb_takeover_run(ctdb, nodemap);
                if (ret != 0) {
                        DEBUG(0, (__location__ " Unable to setup public takeover addresses\n"));
-                       rec->need_takeover_run = true;
                        return -1;
                }
                DEBUG(1, (__location__ " Recovery - done takeover\n"));
@@ -929,6 +932,8 @@ static int do_recovery(struct ctdb_recoverd *rec,
 
        DEBUG(0, (__location__ " Recovery complete\n"));
 
+       rec->need_recovery = false;
+
        /* We just finished a recovery successfully. 
           We now wait for rerecovery_timeout before we allow 
           another recovery to take place.
@@ -1577,6 +1582,12 @@ again:
        }
 
 
+       if (rec->need_recovery) {
+               /* a previous recovery didn't finish */
+               do_recovery(rec, mem_ctx, pnn, num_active, nodemap, vnnmap, nodemap->nodes[j].pnn);
+               goto again;             
+       }
+
        /* verify that all active nodes are in normal mode 
           and not in recovery mode 
         */
@@ -1746,7 +1757,6 @@ again:
                ret = ctdb_takeover_run(ctdb, nodemap);
                if (ret != 0) {
                        DEBUG(0, (__location__ " Unable to setup public takeover addresses - starting recovery\n"));
-                       rec->need_takeover_run = true;
                        do_recovery(rec, mem_ctx, pnn, num_active, nodemap, 
                                    vnnmap, nodemap->nodes[j].pnn);
                }