ctdb/daemon: Make delete IP wait until the IP is released

[garming/samba-autobuild/.git] / ctdb / server / ctdb_takeover.c
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c

index be64003c7a277bf7b4c81680d82e2b0c00f71ce4..9c699be4fff8b7208550b05bdbb32c4aacf954d5 100644 (file)
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -19,7 +19,7 @@
     along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
  #include "includes.h"
-#include "lib/tdb/include/tdb.h"
+#include "tdb.h"
  #include "lib/util/dlinklist.h"
  #include "system/network.h"
  #include "system/filesys.h"
@@ -33,6 +33,12 @@
  #define CTDB_ARP_INTERVAL 1
  #define CTDB_ARP_REPEAT   3
  
+/* Per-node flags consulted by the IP allocation algorithms. */
+struct ctdb_ipflags {
+       bool noiptakeover;      /* when set, can_node_takeover_ip() rejects the node */
+       bool noiphost;          /* when set, can_node_host_ip() rejects the node */
+};
+
  struct ctdb_iface {
         struct ctdb_iface *prev, *next;
         const char *name;
@@ -78,7 +84,7 @@ static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
          * IPs can't be assigned, and after startup IPs can be
          * assigned immediately.
          */
-       i->link_up = ctdb->done_startup;
+       i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
  
         DLIST_ADD(ctdb->ifaces, i);
  
@@ -158,7 +164,6 @@ static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
  {
         struct ctdb_iface *i;
  
-       /* Verify that we dont have an entry for this ip yet */
         for (i=ctdb->ifaces;i;i=i->next) {
                 if (strcmp(i->name, iface) == 0) {
                         return i;
@@ -255,6 +260,10 @@ static bool ctdb_vnn_available(struct ctdb_context *ctdb,
  {
         int i;
  
+       if (vnn->delete_pending) {
+               return false;
+       }
+
         if (vnn->iface && vnn->iface->link_up) {
                 return true;
         }
@@ -505,7 +514,6 @@ static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
                                          state,
                                          ctdb_do_takeip_callback,
                                          state,
-                                        false,
                                          CTDB_EVENT_TAKE_IP,
                                          "%s %s %u",
                                          ctdb_vnn_iface_string(vnn),
@@ -646,7 +654,6 @@ static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
                                          state,
                                          ctdb_do_updateip_callback,
                                          state,
-                                        false,
                                          CTDB_EVENT_UPDATE_IP,
                                          "%s %s %s %u",
                                          state->old->name,
@@ -856,12 +863,23 @@ static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr
                                         (unsigned)client->pid,
                                         ctdb_addr_to_str(addr),
                                         ip->client_id));
-                               ctdb_kill(ctdb, client->pid, SIGKILL);
+                               kill(client->pid, SIGKILL);
                         }
                 }
         }
  }
  
+static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn) /* unlink vnn from ctdb and free it */
+{
+       TALLOC_CTX *mem_ctx = talloc_new(ctdb); /* scratch context, freed at the end; NOTE(review): result is not checked for NULL */
+
+       DLIST_REMOVE(ctdb->vnn, vnn);   /* take vnn off the ctdb->vnn list first */
+       ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx); /* presumably drops interfaces only this vnn referenced - confirm against the helper */
+       ctdb_vnn_unassign_iface(ctdb, vnn);
+       talloc_free(vnn);       /* vnn is invalid from here on; the caller must not use it again */
+       talloc_free(mem_ctx);
+}
+
  /*
    called when releaseip event finishes
   */
@@ -876,6 +894,14 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
                 ctdb_ban_self(ctdb);
         }
  
+       if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
+               DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
+                                 ctdb_addr_to_str(state->addr)));
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               talloc_free(state);
+               return;
+       }
+
         /* send a message to all clients of this node telling them
            that the cluster has been reconfigured and they should
            release any sockets on this IP */
@@ -892,6 +918,12 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
  
         ctdb_vnn_unassign_iface(ctdb, state->vnn);
  
+       /* Process the IP if it has been marked for deletion */
+       if (state->vnn->delete_pending) {
+               do_delete_ip(ctdb, state->vnn);
+               state->vnn = NULL;
+       }
+
         /* the control succeeded */
         ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
         talloc_free(state);
@@ -899,7 +931,9 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
  
  static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
  {
-       state->vnn->update_in_flight = false;
+       if (state->vnn != NULL) {       /* release_ip_callback sets vnn to NULL after do_delete_ip() has freed it */
+               state->vnn->update_in_flight = false;
+       }
         return 0;
  }
  
@@ -972,6 +1006,21 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
                         DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
                         return 0;
                 }
+               if (vnn->iface == NULL) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Public IP %s is hosted on interface %s but we have no VNN\n",
+                              ctdb_addr_to_str(&pip->addr),
+                              iface));
+               } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
+                              ctdb_addr_to_str(&pip->addr),
+                              iface,
+                              ctdb_vnn_iface_string(vnn)));
+                       /* Should we fix vnn->iface?  If we do, what
+                        * happens to reference counts?
+                        */
+               }
         } else {
                 iface = strdup(ctdb_vnn_iface_string(vnn));
         }
@@ -996,7 +1045,6 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
  
         ret = ctdb_event_script_callback(ctdb, 
                                          state, release_ip_callback, state,
-                                        false,
                                          CTDB_EVENT_RELEASE_IP,
                                          "%s %s %u",
                                          iface,
@@ -1107,16 +1155,6 @@ static int ctdb_add_public_address(struct ctdb_context *ctdb,
         return 0;
  }
  
-/*
-  setup the event script directory
-*/
-int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
-{
-       ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
-       CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
-       return 0;
-}
-
  static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te, 
                                   struct timeval t, void *private_data)
  {
@@ -1281,6 +1319,12 @@ int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
         return 0;
  }
  
+struct ctdb_public_ip_list {
+       struct ctdb_public_ip_list *next;       /* following entry; the allocators walk the list until NULL */
+       uint32_t pnn;                           /* node the address is assigned to; compared against -1 to detect "unassigned" */
+       ctdb_sock_addr addr;                    /* the public address this entry describes */
+};
+
  /* Given a physical node, return the number of
     public addresses that is currently assigned to this node.
  */
@@ -1303,13 +1347,13 @@ static int node_ip_coverage(struct ctdb_context *ctdb,
   * node and is NOIPHOST unset?
  */
  static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn, 
-                            struct ctdb_node_map *nodemap,
+                            struct ctdb_ipflags ipflags,
                              struct ctdb_public_ip_list *ip)
  {
         struct ctdb_all_public_ips *public_ips;
         int i;
  
-       if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPHOST) {
+       if (ipflags.noiphost) {
                 return false;
         }
  
@@ -1319,7 +1363,7 @@ static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
                 return false;
         }
  
-       for (i=0;i<public_ips->num;i++) {
+       for (i=0; i<public_ips->num; i++) {
                 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
                         /* yes, this node can serve this public ip */
                         return true;
@@ -1330,14 +1374,14 @@ static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
  }
  
  static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn, 
-                                struct ctdb_node_map *nodemap,
+                                struct ctdb_ipflags ipflags,   /* flags of node pnn only, passed by value */
                                  struct ctdb_public_ip_list *ip)
  {
-       if (nodemap->nodes[pnn].flags & NODE_FLAGS_NOIPTAKEOVER) {
+       if (ipflags.noiptakeover) {     /* node is barred from taking over addresses */
                 return false;
         }
  
-       return can_node_host_ip(ctdb, pnn, nodemap, ip);
+       return can_node_host_ip(ctdb, pnn, ipflags, ip);       /* otherwise the hosting rules decide */
  }
  
  /* search the node lists list for a node to takeover this ip.
@@ -1345,17 +1389,18 @@ static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
     so that the ips get spread out evenly.
  */
  static int find_takeover_node(struct ctdb_context *ctdb, 
-               struct ctdb_node_map *nodemap,
+               struct ctdb_ipflags *ipflags,
                 struct ctdb_public_ip_list *ip,
                 struct ctdb_public_ip_list *all_ips)
  {
         int pnn, min=0, num;
-       int i;
+       int i, numnodes;
  
+       numnodes = talloc_array_length(ipflags);
         pnn    = -1;
-       for (i=0;i<nodemap->num;i++) {
+       for (i=0; i<numnodes; i++) {
                 /* verify that this node can serve this ip */
-               if (!can_node_takeover_ip(ctdb, i, nodemap, ip)) {
+               if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
                         /* no it couldnt   so skip to the next node */
                         continue;
                 }
@@ -1591,7 +1636,7 @@ static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
   * finding the best node for each.
   */
  static void basic_allocate_unassigned(struct ctdb_context *ctdb,
-                                     struct ctdb_node_map *nodemap,
+                                     struct ctdb_ipflags *ipflags,
                                       struct ctdb_public_ip_list *all_ips)
  {
         struct ctdb_public_ip_list *tmp_ip;
@@ -1601,7 +1646,7 @@ static void basic_allocate_unassigned(struct ctdb_context *ctdb,
         */
         for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
                 if (tmp_ip->pnn == -1) {
-                       if (find_takeover_node(ctdb, nodemap, tmp_ip, all_ips)) {
+                       if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
                                 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
                                         ctdb_addr_to_str(&tmp_ip->addr)));
                         }
@@ -1612,14 +1657,15 @@ static void basic_allocate_unassigned(struct ctdb_context *ctdb,
  /* Basic non-deterministic rebalancing algorithm.
   */
  static void basic_failback(struct ctdb_context *ctdb,
-                          struct ctdb_node_map *nodemap,
+                          struct ctdb_ipflags *ipflags,
                            struct ctdb_public_ip_list *all_ips,
                            int num_ips)
  {
-       int i;
+       int i, numnodes;
         int maxnode, maxnum, minnode, minnum, num, retries;
         struct ctdb_public_ip_list *tmp_ip;
  
+       numnodes = talloc_array_length(ipflags);
         retries = 0;
  
  try_again:
@@ -1641,9 +1687,9 @@ try_again:
                 */
                 maxnode = -1;
                 minnode = -1;
-               for (i=0;i<nodemap->num;i++) {
+               for (i=0; i<numnodes; i++) {
                         /* only check nodes that can actually serve this ip */
-                       if (!can_node_takeover_ip(ctdb, i, nodemap, tmp_ip)) {
+                       if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
                                 /* no it couldnt   so skip to the next node */
                                 continue;
                         }
@@ -1688,7 +1734,7 @@ try_again:
                         /* Reassign one of maxnode's VNNs */
                         for (tmp=all_ips;tmp;tmp=tmp->next) {
                                 if (tmp->pnn == maxnode) {
-                                       (void)find_takeover_node(ctdb, nodemap, tmp, all_ips);
+                                       (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
                                         retries++;
                                         goto try_again;;
                                 }
@@ -1697,51 +1743,24 @@ try_again:
         }
  }
  
-struct ctdb_rebalancenodes {
-       struct ctdb_rebalancenodes *next;
-       uint32_t pnn;
-};
-static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
-
-
-/* set this flag to force the node to be rebalanced even if it just didnt
-   become healthy again.
-*/
-void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
-{
-       struct ctdb_rebalancenodes *rebalance;
-
-       for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
-               if (rebalance->pnn == pnn) {
-                       return;
-               }
-       }
-
-       rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
-       rebalance->pnn = pnn;
-       rebalance->next = force_rebalance_list;
-       force_rebalance_list = rebalance;
-}
-
-/* Do necessary LCP2 initialisation.  Bury it in a function here so
- * that we can unit test it.
- */
-static void lcp2_init(struct ctdb_context * tmp_ctx,
-              struct ctdb_node_map * nodemap,
-              uint32_t mask,
-              struct ctdb_public_ip_list *all_ips,
-              uint32_t **lcp2_imbalances,
-              bool **rebalance_candidates)
+static void lcp2_init(struct ctdb_context *tmp_ctx,
+                     struct ctdb_ipflags *ipflags,
+                     struct ctdb_public_ip_list *all_ips,
+                     uint32_t *force_rebalance_nodes,
+                     uint32_t **lcp2_imbalances,
+                     bool **rebalance_candidates)
  {
-       int i;
+       int i, numnodes;
         struct ctdb_public_ip_list *tmp_ip;
  
-       *rebalance_candidates = talloc_array(tmp_ctx, bool, nodemap->num);
+       numnodes = talloc_array_length(ipflags);
+
+       *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
         CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
-       *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, nodemap->num);
+       *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
         CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
  
-       for (i=0;i<nodemap->num;i++) {
+       for (i=0; i<numnodes; i++) {
                 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
                 /* First step: assume all nodes are candidates */
                 (*rebalance_candidates)[i] = true;
@@ -1762,16 +1781,20 @@ static void lcp2_init(struct ctdb_context * tmp_ctx,
  
         /* 3rd step: if a node is forced to re-balance then
            we allow failback onto the node */
-       while (force_rebalance_list != NULL) {
-               struct ctdb_rebalancenodes *next = force_rebalance_list->next;
-
-               if (force_rebalance_list->pnn <= nodemap->num) {
-                       (*rebalance_candidates)[force_rebalance_list->pnn] = true;
+       if (force_rebalance_nodes == NULL) {
+               return;
+       }
+       for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
+               uint32_t pnn = force_rebalance_nodes[i];
+               if (pnn >= numnodes) {  /* pnn would index past rebalance_candidates[]; skip it */
+                       DEBUG(DEBUG_ERR,
+                             (__location__ " unknown node %u\n", pnn));
+                       continue;
                 }
  
-               DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
-               talloc_free(force_rebalance_list);
-               force_rebalance_list = next;
+               DEBUG(DEBUG_NOTICE,
+                     ("Forcing rebalancing of IPs to node %u\n", pnn));
+               (*rebalance_candidates)[pnn] = true;
         }
  }
  
@@ -1779,12 +1802,12 @@ static void lcp2_init(struct ctdb_context * tmp_ctx,
   * the IP/node combination that will cost the least.
   */
  static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
-                                    struct ctdb_node_map *nodemap,
+                                    struct ctdb_ipflags *ipflags,
                                      struct ctdb_public_ip_list *all_ips,
                                      uint32_t *lcp2_imbalances)
  {
         struct ctdb_public_ip_list *tmp_ip;
-       int dstnode;
+       int dstnode, numnodes;
  
         int minnode;
         uint32_t mindsum, dstdsum, dstimbl, minimbl;
@@ -1793,6 +1816,8 @@ static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
         bool should_loop = true;
         bool have_unassigned = true;
  
+       numnodes = talloc_array_length(ipflags);
+
         while (have_unassigned && should_loop) {
                 should_loop = false;
  
@@ -1809,10 +1834,11 @@ static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
                                 continue;
                         }
  
-                       for (dstnode=0; dstnode < nodemap->num; dstnode++) {
+                       for (dstnode=0; dstnode<numnodes; dstnode++) {
                                 /* only check nodes that can actually takeover this ip */
                                 if (!can_node_takeover_ip(ctdb, dstnode,
-                                                         nodemap, tmp_ip)) {
+                                                         ipflags[dstnode],
+                                                         tmp_ip)) {
                                         /* no it couldnt   so skip to the next node */
                                         continue;
                                 }
@@ -1874,27 +1900,30 @@ static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
   * combination to move from the source node.
   */
  static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
-                                   struct ctdb_node_map *nodemap,
+                                   struct ctdb_ipflags *ipflags,
                                     struct ctdb_public_ip_list *all_ips,
                                     int srcnode,
-                                   uint32_t candimbl,
                                     uint32_t *lcp2_imbalances,
                                     bool *rebalance_candidates)
  {
-       int dstnode, mindstnode;
+       int dstnode, mindstnode, numnodes;
         uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
         uint32_t minsrcimbl, mindstimbl;
         struct ctdb_public_ip_list *minip;
         struct ctdb_public_ip_list *tmp_ip;
  
         /* Find an IP and destination node that best reduces imbalance. */
+       srcimbl = 0;
         minip = NULL;
         minsrcimbl = 0;
         mindstnode = -1;
         mindstimbl = 0;
  
+       numnodes = talloc_array_length(ipflags);
+
         DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
-       DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
+       DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
+                          srcnode, lcp2_imbalances[srcnode]));
  
         for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
                 /* Only consider addresses on srcnode. */
@@ -1904,7 +1933,7 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
  
                 /* What is this IP address costing the source node? */
                 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
-               srcimbl = candimbl - srcdsum;
+               srcimbl = lcp2_imbalances[srcnode] - srcdsum;
  
                 /* Consider this IP address would cost each potential
                  * destination node.  Destination nodes are limited to
@@ -1912,14 +1941,14 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
                  * to do gratuitous failover of IPs just to make minor
                  * balance improvements.
                  */
-               for (dstnode=0; dstnode < nodemap->num; dstnode++) {
+               for (dstnode=0; dstnode<numnodes; dstnode++) {
                         if (!rebalance_candidates[dstnode]) {
                                 continue;
                         }
  
                         /* only check nodes that can actually takeover this ip */
                         if (!can_node_takeover_ip(ctdb, dstnode,
-                                                 nodemap, tmp_ip)) {
+                                                 ipflags[dstnode], tmp_ip)) {
                                 /* no it couldnt   so skip to the next node */
                                 continue;
                         }
@@ -1927,11 +1956,12 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
                         dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
                         dstimbl = lcp2_imbalances[dstnode] + dstdsum;
                         DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
-                                          srcnode, srcimbl - lcp2_imbalances[srcnode],
+                                          srcnode, -srcdsum,
                                            ctdb_addr_to_str(&(tmp_ip->addr)),
-                                          dstnode, dstimbl - lcp2_imbalances[dstnode]));
+                                          dstnode, dstdsum));
  
-                       if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
+                       if ((dstimbl < lcp2_imbalances[srcnode]) &&
+                           (dstdsum < srcdsum) &&                      \
                             ((mindstnode == -1) ||                              \
                              ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
  
@@ -1952,7 +1982,7 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
                                   mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
  
  
-               lcp2_imbalances[srcnode] = srcimbl;
+               lcp2_imbalances[srcnode] = minsrcimbl;
                 lcp2_imbalances[mindstnode] = mindstimbl;
                 minip->pnn = mindstnode;
  
@@ -1987,45 +2017,35 @@ static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
   * IP/destination node combination to move from the source node.
   */
  static void lcp2_failback(struct ctdb_context *ctdb,
-                         struct ctdb_node_map *nodemap,
+                         struct ctdb_ipflags *ipflags,
                           struct ctdb_public_ip_list *all_ips,
                           uint32_t *lcp2_imbalances,
                           bool *rebalance_candidates)
  {
-       int i, num_rebalance_candidates;
+       int i, numnodes;
         struct lcp2_imbalance_pnn * lips;
         bool again;
  
-try_again:
-
-       /* It is only worth continuing if we have suitable target
-        * nodes to transfer IPs to.  This check is much cheaper than
-        * continuing on...
-        */
-       num_rebalance_candidates = 0;
-       for (i = 0; i < nodemap->num; i++) {
-               if (rebalance_candidates[i]) {
-                       num_rebalance_candidates++;
-               }
-       }
-       if (num_rebalance_candidates == 0) {
-               return;
-       }
+       numnodes = talloc_array_length(ipflags);
  
+try_again:
         /* Put the imbalances and nodes into an array, sort them and
          * iterate through candidates.  Usually the 1st one will be
          * used, so this doesn't cost much...
          */
-       lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, nodemap->num);
-       for (i = 0; i < nodemap->num; i++) {
+       DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
+       DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
+       lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
+       for (i=0; i<numnodes; i++) {
                 lips[i].imbalance = lcp2_imbalances[i];
                 lips[i].pnn = i;
+               DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
         }
-       qsort(lips, nodemap->num, sizeof(struct lcp2_imbalance_pnn),
+       qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
               lcp2_cmp_imbalance_pnn);
  
         again = false;
-       for (i = 0; i < nodemap->num; i++) {
+       for (i=0; i<numnodes; i++) {
                 /* This means that all nodes had 0 or 1 addresses, so
                  * can't be imbalanced.
                  */
@@ -2034,10 +2054,9 @@ try_again:
                 }
  
                 if (lcp2_failback_candidate(ctdb,
-                                           nodemap,
+                                           ipflags,
                                             all_ips,
                                             lips[i].pnn,
-                                           lips[i].imbalance,
                                             lcp2_imbalances,
                                             rebalance_candidates)) {
                         again = true;
@@ -2052,7 +2071,7 @@ try_again:
  }
  
  static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
-                                   struct ctdb_node_map *nodemap,
+                                   struct ctdb_ipflags *ipflags,
                                     struct ctdb_public_ip_list *all_ips)
  {
         struct ctdb_public_ip_list *tmp_ip;
@@ -2065,7 +2084,7 @@ static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
                         continue;
                 }
                 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
-                                     nodemap, tmp_ip) != 0) {
+                                     ipflags[tmp_ip->pnn], tmp_ip) != 0) {
                         /* this node can not serve this ip. */
                         DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
                                            ctdb_addr_to_str(&(tmp_ip->addr)),
@@ -2076,11 +2095,13 @@ static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
  }
  
  static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
-                                      struct ctdb_node_map *nodemap,
+                                      struct ctdb_ipflags *ipflags,
                                        struct ctdb_public_ip_list *all_ips)
  {
         struct ctdb_public_ip_list *tmp_ip;
-       int i;
+       int i, numnodes;
+
+       numnodes = talloc_array_length(ipflags);
  
         DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
         /* Allocate IPs to nodes in a modulo fashion so that IPs will
@@ -2089,7 +2110,7 @@ static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
         */
  
         for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
-               tmp_ip->pnn = i%nodemap->num;
+               tmp_ip->pnn = i % numnodes;
         }
  
         /* IP failback doesn't make sense with deterministic
@@ -2100,15 +2121,15 @@ static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
                 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
         }
  
-       unassign_unsuitable_ips(ctdb, nodemap, all_ips);
+       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
  
-       basic_allocate_unassigned(ctdb, nodemap, all_ips);
+       basic_allocate_unassigned(ctdb, ipflags, all_ips);
  
         /* No failback here! */
  }
  
  static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
-                                         struct ctdb_node_map *nodemap,
+                                         struct ctdb_ipflags *ipflags,
                                           struct ctdb_public_ip_list *all_ips)
  {
         /* This should be pushed down into basic_failback. */
@@ -2118,9 +2139,9 @@ static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
                 num_ips++;
         }
  
-       unassign_unsuitable_ips(ctdb, nodemap, all_ips);
+       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
  
-       basic_allocate_unassigned(ctdb, nodemap, all_ips);
+       basic_allocate_unassigned(ctdb, ipflags, all_ips);
  
         /* If we don't want IPs to fail back then don't rebalance IPs. */
         if (1 == ctdb->tunable.no_ip_failback) {
@@ -2130,35 +2151,51 @@ static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
         /* Now, try to make sure the ip adresses are evenly distributed
            across the nodes.
         */
-       basic_failback(ctdb, nodemap, all_ips, num_ips);
+       basic_failback(ctdb, ipflags, all_ips, num_ips);
  }
  
  static void ip_alloc_lcp2(struct ctdb_context *ctdb,
-                         struct ctdb_node_map *nodemap,
+                         struct ctdb_ipflags *ipflags,
                           struct ctdb_public_ip_list *all_ips,
-                         uint32_t mask)
+                         uint32_t *force_rebalance_nodes)
  {
         uint32_t *lcp2_imbalances;
         bool *rebalance_candidates;
+       int numnodes, num_rebalance_candidates, i;
  
         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
  
-       unassign_unsuitable_ips(ctdb, nodemap, all_ips);
+       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
  
-       lcp2_init(tmp_ctx, nodemap, mask, all_ips,
+       lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
                   &lcp2_imbalances, &rebalance_candidates);
  
-       lcp2_allocate_unassigned(ctdb, nodemap, all_ips, lcp2_imbalances);
+       lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
  
         /* If we don't want IPs to fail back then don't rebalance IPs. */
         if (1 == ctdb->tunable.no_ip_failback) {
                 goto finished;
         }
  
+       /* It is only worth continuing if we have suitable target
+        * nodes to transfer IPs to.  This check is much cheaper than
+        * continuing on...
+        */
+       numnodes = talloc_array_length(ipflags);
+       num_rebalance_candidates = 0;
+       for (i=0; i<numnodes; i++) {
+               if (rebalance_candidates[i]) {
+                       num_rebalance_candidates++;
+               }
+       }
+       if (num_rebalance_candidates == 0) {
+               goto finished;
+       }
+
         /* Now, try to make sure the ip adresses are evenly distributed
            across the nodes.
         */
-       lcp2_failback(ctdb, nodemap, all_ips,
+       lcp2_failback(ctdb, ipflags, all_ips,
                       lcp2_imbalances, rebalance_candidates);
  
  finished:
@@ -2182,23 +2219,10 @@ static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
  
  /* The calculation part of the IP allocation algorithm. */
  static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
-                                  struct ctdb_node_map *nodemap,
-                                  struct ctdb_public_ip_list **all_ips_p)
+                                  struct ctdb_ipflags *ipflags,
+                                  struct ctdb_public_ip_list **all_ips_p,
+                                  uint32_t *force_rebalance_nodes)
  {
-       uint32_t mask;
-
-       /* If we have healthy nodes then we will only consider them
-          for serving public addresses
-       */
-       mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
-       if (all_nodes_are_disabled(nodemap) &&
-           (ctdb->tunable.no_ip_host_on_all_disabled == 0)) {
-               /* We didnt have any completely healthy nodes so
-                  use "disabled" nodes as a fallback
-               */
-               mask = NODE_FLAGS_INACTIVE;
-       }
-
         /* since nodes only know about those public addresses that
            can be served by that particular node, no single node has
            a full list of all public addresses that exist in the cluster.
@@ -2210,11 +2234,11 @@ static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
         *all_ips_p = create_merged_ip_list(ctdb);
  
          if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
-               ip_alloc_lcp2(ctdb, nodemap, *all_ips_p, mask);
+               ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
         } else if (1 == ctdb->tunable.deterministic_public_ips) {
-               ip_alloc_deterministic_ips(ctdb, nodemap, *all_ips_p);
+               ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
         } else {
-               ip_alloc_nondeterministic_ips(ctdb, nodemap, *all_ips_p);
+               ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
         }
  
         /* at this point ->pnn is the node which will own each IP
@@ -2227,6 +2251,7 @@ static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
  struct get_tunable_callback_data {
         const char *tunable;
         uint32_t *out;
+       bool fatal;
  };
  
  static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
@@ -2238,9 +2263,7 @@ static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
         int size;
  
         if (res != 0) {
-               DEBUG(DEBUG_ERR,
-                     ("Failure to read \"%s\" tunable from remote node %d\n",
-                      cd->tunable, pnn));
+               /* Already handled in fail callback */
                 return;
         }
  
@@ -2248,10 +2271,11 @@ static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
                 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
                                  cd->tunable, pnn, (int)sizeof(uint32_t),
                                  (int)outdata.dsize));
+               cd->fatal = true;
                 return;
         }
  
-       size = talloc_get_size(cd->out) / sizeof(uint32_t);
+       size = talloc_array_length(cd->out);
         if (pnn >= size) {
                 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
                                  cd->tunable, pnn, size));
@@ -2262,21 +2286,56 @@ static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
         cd->out[pnn] = *(uint32_t *)outdata.dptr;
  }
  
+static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                      int32_t res, TDB_DATA outdata,
+                                      void *callback)
+{
+       struct get_tunable_callback_data *cd =
+               (struct get_tunable_callback_data *)callback;
+
+       switch (res) {
+       case -ETIME:
+               DEBUG(DEBUG_ERR,
+                     ("Timed out getting tunable \"%s\" from node %d\n",
+                      cd->tunable, pnn));
+               cd->fatal = true;
+               break;
+       case -EINVAL:
+       case -1:
+               DEBUG(DEBUG_WARNING,
+                     ("Tunable \"%s\" not implemented on node %d\n",
+                      cd->tunable, pnn));
+               break;
+       default:
+               DEBUG(DEBUG_ERR,
+                     ("Unexpected error getting tunable \"%s\" from node %d\n",
+                      cd->tunable, pnn));
+               cd->fatal = true;
+       }
+}
+
  static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
                                         TALLOC_CTX *tmp_ctx,
                                         struct ctdb_node_map *nodemap,
-                                       const char *tunable)
+                                       const char *tunable,
+                                       uint32_t default_value)
  {
         TDB_DATA data;
         struct ctdb_control_get_tunable *t;
         uint32_t *nodes;
         uint32_t *tvals;
         struct get_tunable_callback_data callback_data;
+       int i;
  
-       tvals = talloc_zero_array(tmp_ctx, uint32_t, nodemap->num);
+       tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
         CTDB_NO_MEMORY_NULL(ctdb, tvals);
+       for (i=0; i<nodemap->num; i++) {
+               tvals[i] = default_value;
+       }
+               
         callback_data.out = tvals;
         callback_data.tunable = tunable;
+       callback_data.fatal = false;
  
         data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
         data.dptr  = talloc_size(tmp_ctx, data.dsize);
@@ -2287,9 +2346,13 @@ static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
                                       nodes, 0, TAKEOVER_TIMEOUT(),
                                       false, data,
-                                     get_tunable_callback, NULL,
+                                     get_tunable_callback,
+                                     get_tunable_fail_callback,
                                       &callback_data) != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " ctdb_control to get %s tunable failed\n", tunable));
+               if (callback_data.fatal) {
+                       talloc_free(tvals);
+                       tvals = NULL;
+               }
         }
         talloc_free(nodes);
         talloc_free(data.dptr);
@@ -2297,6 +2360,98 @@ static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
         return tvals;
  }
  
+struct get_runstate_callback_data {
+       enum ctdb_runstate *out;
+       bool fatal;
+};
+
+static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                 int32_t res, TDB_DATA outdata,
+                                 void *callback_data)
+{
+       struct get_runstate_callback_data *cd =
+               (struct get_runstate_callback_data *)callback_data;
+       int size;
+
+       if (res != 0) {
+               /* Already handled in fail callback */
+               return;
+       }
+
+       if (outdata.dsize != sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
+                                pnn, (int)sizeof(uint32_t),
+                                (int)outdata.dsize));
+               cd->fatal = true;
+               return;
+       }
+
+       size = talloc_array_length(cd->out);
+       if (pnn >= size) {
+               DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
+                                pnn, size));
+               return;
+       }
+
+       cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
+}
+
+static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                      int32_t res, TDB_DATA outdata,
+                                      void *callback)
+{
+       struct get_runstate_callback_data *cd =
+               (struct get_runstate_callback_data *)callback;
+
+       switch (res) {
+       case -ETIME:
+               DEBUG(DEBUG_ERR,
+                     ("Timed out getting runstate from node %d\n", pnn));
+               cd->fatal = true;
+               break;
+       default:
+               DEBUG(DEBUG_WARNING,
+                     ("Error getting runstate from node %d - assuming runstates not supported\n",
+                      pnn));
+       }
+}
+
+static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
+                                                   TALLOC_CTX *tmp_ctx,
+                                                   struct ctdb_node_map *nodemap,
+                                                   enum ctdb_runstate default_value)
+{
+       uint32_t *nodes;
+       enum ctdb_runstate *rs;
+       struct get_runstate_callback_data callback_data;
+       int i;
+       /* Result array is indexed by PNN; NULL is returned if a callback flags a fatal error. */
+       rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
+       CTDB_NO_MEMORY_NULL(ctdb, rs);
+       for (i=0; i<nodemap->num; i++) {
+               rs[i] = default_value;
+       }
+       /* Entries not overwritten by get_runstate_callback() keep default_value. */
+       callback_data.out = rs;
+       callback_data.fatal = false;
+
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
+                                     nodes, 0, TAKEOVER_TIMEOUT(),
+                                     true, tdb_null,
+                                     get_runstate_callback,
+                                     get_runstate_fail_callback,
+                                     &callback_data) != 0) {
+               if (callback_data.fatal) {
+                       talloc_free(rs); /* rs is talloc memory: libc free() must not be used */
+                       rs = NULL;
+               }
+       }
+       talloc_free(nodes);
+
+       return rs;
+}
+
  /* Set internal flags for IP allocation:
   *   Clear ip flags
   *   Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
@@ -2306,27 +2461,35 @@ static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
   *   else
   *     Set NOIPHOST ip flags for disabled nodes
   */
-static void set_ipflags_internal(struct ctdb_node_map *nodemap,
-                                uint32_t *tval_noiptakeover,
-                                uint32_t *tval_noiphostonalldisabled)
+static struct ctdb_ipflags *
+set_ipflags_internal(struct ctdb_context *ctdb,
+                    TALLOC_CTX *tmp_ctx,
+                    struct ctdb_node_map *nodemap,
+                    uint32_t *tval_noiptakeover,
+                    uint32_t *tval_noiphostonalldisabled,
+                    enum ctdb_runstate *runstate)
  {
         int i;
+       struct ctdb_ipflags *ipflags;
  
-       /* Clear IP flags */
-       for (i=0;i<nodemap->num;i++) {
-               nodemap->nodes[i].flags &=
-                       ~(NODE_FLAGS_NOIPTAKEOVER|NODE_FLAGS_NOIPHOST);
-       }
+       /* Clear IP flags - implicit due to talloc_zero */
+       ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
+       CTDB_NO_MEMORY_NULL(ctdb, ipflags);
  
         for (i=0;i<nodemap->num;i++) {
                 /* Can not take IPs on node with NoIPTakeover set */
                 if (tval_noiptakeover[i] != 0) {
-                       nodemap->nodes[i].flags |= NODE_FLAGS_NOIPTAKEOVER;
+                       ipflags[i].noiptakeover = true;
                 }
  
+               /* Can not host IPs on node not in RUNNING state */
+               if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
+                       ipflags[i].noiphost = true;
+                       continue;
+               }
                 /* Can not host IPs on INACTIVE node */
                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
-                       nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
+                       ipflags[i].noiphost = true;
                 }
         }
  
@@ -2336,7 +2499,7 @@ static void set_ipflags_internal(struct ctdb_node_map *nodemap,
                  */
                 for (i=0;i<nodemap->num;i++) {
                         if (tval_noiphostonalldisabled[i] != 0) {
-                               nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
+                               ipflags[i].noiphost = true;
                         }
                 }
         } else {
@@ -2345,48 +2508,165 @@ static void set_ipflags_internal(struct ctdb_node_map *nodemap,
                  */
                 for (i=0;i<nodemap->num;i++) {
                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
-                               nodemap->nodes[i].flags |= NODE_FLAGS_NOIPHOST;
+                               ipflags[i].noiphost = true;
                         }
                 }
         }
+
+       return ipflags;
  }
  
-static bool set_ipflags(struct ctdb_context *ctdb,
-                       TALLOC_CTX *tmp_ctx,
-                       struct ctdb_node_map *nodemap)
+static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
+                                       TALLOC_CTX *tmp_ctx,
+                                       struct ctdb_node_map *nodemap)
  {
         uint32_t *tval_noiptakeover;
         uint32_t *tval_noiphostonalldisabled;
+       struct ctdb_ipflags *ipflags;
+       enum ctdb_runstate *runstate;
+
  
         tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
-                                                  "NoIPTakeover");
+                                                  "NoIPTakeover", 0);
         if (tval_noiptakeover == NULL) {
-               return false;
+               return NULL;
         }
  
         tval_noiphostonalldisabled =
                 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
-                                      "NoIPHostOnAllDisabled");
+                                      "NoIPHostOnAllDisabled", 0);
         if (tval_noiphostonalldisabled == NULL) {
-               return false;
+               /* Caller frees tmp_ctx */
+               return NULL;
+       }
+
+       /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
+        * will default to CTDB_RUNSTATE_RUNNING.  This ensures
+        * reasonable behaviour on a mixed cluster during upgrade.
+        */
+       runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
+                                          CTDB_RUNSTATE_RUNNING);
+       if (runstate == NULL) {
+               /* Caller frees tmp_ctx */
+               return NULL;
         }
  
-       set_ipflags_internal(nodemap,
-                            tval_noiptakeover, tval_noiphostonalldisabled);
+       ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
+                                      tval_noiptakeover,
+                                      tval_noiphostonalldisabled,
+                                      runstate);
  
         talloc_free(tval_noiptakeover);
         talloc_free(tval_noiphostonalldisabled);
+       talloc_free(runstate);
+
+       return ipflags;
+}
+
+struct iprealloc_callback_data {
+       bool *retry_nodes;
+       int retry_count;
+       client_async_callback fail_callback;
+       void *fail_callback_data;
+       struct ctdb_node_map *nodemap;
+};
+
+static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                       int32_t res, TDB_DATA outdata,
+                                       void *callback)
+{
+       int numnodes;
+       struct iprealloc_callback_data *cd =
+               (struct iprealloc_callback_data *)callback;
+       /* pnn indexes retry_nodes[] and nodemap->nodes[] below, so it must be < numnodes */
+       numnodes = talloc_array_length(cd->retry_nodes);
+       if (pnn >= numnodes) {
+               DEBUG(DEBUG_ERR,
+                     ("ipreallocated failure from node %d, "
+                      "but only %d nodes in nodemap\n",
+                      pnn, numnodes));
+               return;
+       }
+
+       /* Can't run the "ipreallocated" event on a INACTIVE node */
+       if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
+               DEBUG(DEBUG_WARNING,
+                     ("ipreallocated failed on inactive node %d, ignoring\n",
+                      pnn));
+               return;
+       }
+
+       switch (res) {
+       case -ETIME:
+               /* If the control timed out then that's a real error,
+                * so call the real fail callback
+                */
+               if (cd->fail_callback) {
+                       cd->fail_callback(ctdb, pnn, res, outdata,
+                                         cd->fail_callback_data);
+               } else {
+                       DEBUG(DEBUG_WARNING,
+                             ("iprealloc timed out but no callback registered\n"));
+               }
+               break;
+       default:
+               /* If not a timeout then either the ipreallocated
+                * eventscript (or some setup) failed.  This might
+                * have failed because the IPREALLOCATED control isn't
+                * implemented - right now there is no way of knowing
+                * because the error codes are all folded down to -1.
+                * Consider retrying using EVENTSCRIPT control...
+                */
+               DEBUG(DEBUG_WARNING,
+                     ("ipreallocated failure from node %d, flagging retry\n",
+                      pnn));
+               cd->retry_nodes[pnn] = true;
+               cd->retry_count++;
+       }
+}
  
-       return true;
+struct takeover_callback_data {
+       bool *node_failed;
+       client_async_callback fail_callback;
+       void *fail_callback_data;
+       struct ctdb_node_map *nodemap;
+};
+
+static void takeover_run_fail_callback(struct ctdb_context *ctdb,
+                                      uint32_t node_pnn, int32_t res,
+                                      TDB_DATA outdata, void *callback_data)
+{
+       struct takeover_callback_data *cd =
+               talloc_get_type_abort(callback_data,
+                                     struct takeover_callback_data);
+       int i;
+
+       for (i = 0; i < cd->nodemap->num; i++) {
+               if (node_pnn == cd->nodemap->nodes[i].pnn) {
+                       break;
+               }
+       }
+
+       if (i == cd->nodemap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
+               return;
+       }
+
+       if (!cd->node_failed[i]) {
+               cd->node_failed[i] = true;
+               cd->fail_callback(ctdb, node_pnn, res, outdata,
+                                 cd->fail_callback_data);
+       }
  }
  
  /*
    make any IP alias changes for public addresses that are necessary 
   */
  int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+                     uint32_t *force_rebalance_nodes,
                       client_async_callback fail_callback, void *callback_data)
  {
-       int i;
+       int i, j, ret;
         struct ctdb_public_ip ip;
         struct ctdb_public_ipv4 ipv4;
         uint32_t *nodes;
@@ -2396,7 +2676,10 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
         struct client_async_data *async_data;
         struct ctdb_client_control_state *state;
         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
-       uint32_t disable_timeout;
+       struct ctdb_ipflags *ipflags;
+       struct takeover_callback_data *takeover_data;
+       struct iprealloc_callback_data iprealloc_data;
+       bool *retry_data;
  
         /*
          * ip failover is completely disabled, just send out the 
@@ -2406,37 +2689,37 @@ int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
                 goto ipreallocated;
         }
  
-       if (!set_ipflags(ctdb, tmp_ctx, nodemap)) {
-               DEBUG(DEBUG_ERR,("Failed to set IP flags from tunables\n"));
+       ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
+       if (ipflags == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
+               talloc_free(tmp_ctx);
                 return -1;
         }
  
         ZERO_STRUCT(ip);
  
         /* Do the IP reassignment calculations */
-       ctdb_takeover_run_core(ctdb, nodemap, &all_ips);
+       ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
  
-       /* The recovery daemon does regular sanity checks of the IPs.
-        * However, sometimes it is overzealous and thinks changes are
-        * required when they're already underway.  This stops the
-        * checks for a while before we start moving IPs.
+       /* Now tell all nodes to release any public IPs they should not
+        * host.  This will be a NOOP on nodes that don't currently
+        * hold the given IP.
          */
-       disable_timeout = ctdb->tunable.takeover_timeout;
-       data.dptr  = (uint8_t*)&disable_timeout;
-       data.dsize = sizeof(disable_timeout);
-       if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
-                                    CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
-               DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
-       }
+       takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
+
+       takeover_data->node_failed = talloc_zero_array(tmp_ctx,
+                                                      bool, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
+       takeover_data->fail_callback = fail_callback;
+       takeover_data->fail_callback_data = callback_data;
+       takeover_data->nodemap = nodemap;
  
-       /* now tell all nodes to delete any alias that they should not
-          have.  This will be a NOOP on nodes that don't currently
-          hold the given alias */
         async_data = talloc_zero(tmp_ctx, struct client_async_data);
         CTDB_NO_MEMORY_FATAL(ctdb, async_data);
  
-       async_data->fail_callback = fail_callback;
-       async_data->callback_data = callback_data;
+       async_data->fail_callback = takeover_run_fail_callback;
+       async_data->callback_data = takeover_data;
  
         for (i=0;i<nodemap->num;i++) {
                 /* don't talk to unconnected nodes, but do talk to banned nodes */
@@ -2550,17 +2833,58 @@ ipreallocated:
          * IPs have moved.  Once upon a time this event only used to
          * update natwg.
          */
+       retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
+       iprealloc_data.retry_nodes = retry_data;
+       iprealloc_data.retry_count = 0;
+       iprealloc_data.fail_callback = fail_callback;
+       iprealloc_data.fail_callback_data = callback_data;
+       iprealloc_data.nodemap = nodemap;
+
         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
-       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
-                                     nodes, 0, TAKEOVER_TIMEOUT(),
-                                     false, tdb_null,
-                                     NULL, fail_callback,
-                                     callback_data) != 0) {
-               DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
+       ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
+                                       nodes, 0, TAKEOVER_TIMEOUT(),
+                                       false, tdb_null,
+                                       NULL, iprealloc_fail_callback,
+                                       &iprealloc_data);
+       if (ret != 0) {
+               /* If the control failed then we should retry to any
+                * nodes flagged by iprealloc_fail_callback using the
+                * EVENTSCRIPT control.  This is a best-effort at
+                * backward compatibility when running a mixed cluster
+                * where some nodes have not yet been upgraded to
+                * support the IPREALLOCATED control.
+                */
+               DEBUG(DEBUG_WARNING,
+                     ("Retry ipreallocated to some nodes using eventscript control\n"));
+
+               nodes = talloc_array(tmp_ctx, uint32_t,
+                                    iprealloc_data.retry_count);
+               CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+               j = 0;
+               for (i=0; i<nodemap->num; i++) {
+                       if (iprealloc_data.retry_nodes[i]) {
+                               nodes[j] = i;
+                               j++;
+                       }
+               }
+
+               data.dptr  = discard_const("ipreallocated");
+               data.dsize = strlen((char *)data.dptr) + 1; 
+               ret = ctdb_client_async_control(ctdb,
+                                               CTDB_CONTROL_RUN_EVENTSCRIPTS,
+                                               nodes, 0, TAKEOVER_TIMEOUT(),
+                                               false, data,
+                                               NULL, fail_callback,
+                                               callback_data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
+               }
         }
  
         talloc_free(tmp_ctx);
-       return 0;
+       return ret;
  }
  
  
@@ -2599,6 +2923,11 @@ int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
         struct ctdb_vnn *vnn;
         ctdb_sock_addr addr;
  
+       /* If we don't have public IPs, tickles are useless */
+       if (ctdb->vnn == NULL) {
+               return 0;
+       }
+
         switch (indata.dsize) {
         case sizeof(struct ctdb_control_tcp):
                 old_addr = (struct ctdb_control_tcp *)indata.dptr;
@@ -2743,6 +3072,11 @@ int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tc
         struct ctdb_tcp_connection tcp;
         struct ctdb_vnn *vnn;
  
+       /* If we don't have public IPs, tickles are useless */
+       if (ctdb->vnn == NULL) {
+               return 0;
+       }
+
         vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
         if (vnn == NULL) {
                 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
@@ -2756,9 +3090,7 @@ int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tc
  
         /* If this is the first tickle */
         if (tcparray == NULL) {
-               tcparray = talloc_size(ctdb->nodes, 
-                       offsetof(struct ctdb_tcp_array, connections) +
-                       sizeof(struct ctdb_tcp_connection) * 1);
+               tcparray = talloc(vnn, struct ctdb_tcp_array);
                 CTDB_NO_MEMORY(ctdb, tcparray);
                 vnn->tcp_array = tcparray;
  
@@ -2780,7 +3112,7 @@ int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tc
         /* Do we already have this tickle ?*/
         tcp.src_addr = p->src_addr;
         tcp.dst_addr = p->dst_addr;
-       if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
+       if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
                 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
                         ctdb_addr_to_str(&tcp.dst_addr),
                         ntohs(tcp.dst_addr.ip.sin_port),
@@ -2794,11 +3126,10 @@ int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tc
                                         tcparray->num+1);
         CTDB_NO_MEMORY(ctdb, tcparray->connections);
  
-       vnn->tcp_array = tcparray;
         tcparray->connections[tcparray->num].src_addr = p->src_addr;
         tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
         tcparray->num++;
-                               
+
         DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
                 ctdb_addr_to_str(&tcp.dst_addr),
                 ntohs(tcp.dst_addr.ip.sin_port),
@@ -2883,6 +3214,11 @@ int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
  {
         struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
  
+       /* If we don't have public IPs, tickles are useless */
+       if (ctdb->vnn == NULL) {
+               return 0;
+       }
+
         ctdb_remove_tcp_connection(ctdb, conn);
  
         return 0;
@@ -2890,12 +3226,20 @@ int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
  
  
  /*
-  called when a daemon restarts - send all tickes for all public addresses
-  we are serving immediately to the new node.
+  Called when another daemon starts - causes all tickles for all
+  public addresses we are serving to be sent to the new node on the
+  next check.  This actually causes the next scheduled call to
+  tdb_update_tcp_tickles() to update all nodes.  This is simple and
+  doesn't require careful error handling.
   */
-int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
+int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
  {
-/*XXX here we should send all tickes we are serving to the new node */
+       struct ctdb_vnn *vnn;
+
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+               vnn->tcp_update_needed = true;
+       }
+
         return 0;
  }
  
@@ -2920,6 +3264,7 @@ void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
  void ctdb_release_all_ips(struct ctdb_context *ctdb)
  {
         struct ctdb_vnn *vnn;
+       int count = 0;
  
         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
                 if (!ctdb_sys_have_ip(&vnn->public_address)) {
@@ -2929,13 +3274,22 @@ void ctdb_release_all_ips(struct ctdb_context *ctdb)
                 if (!vnn->iface) {
                         continue;
                 }
+
+               DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
+                                   ctdb_addr_to_str(&vnn->public_address),
+                                   vnn->public_netmask_bits,
+                                   ctdb_vnn_iface_string(vnn)));
+
                 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
                                   ctdb_vnn_iface_string(vnn),
                                   ctdb_addr_to_str(&vnn->public_address),
                                   vnn->public_netmask_bits);
                 release_kill_clients(ctdb, &vnn->public_address);
                 ctdb_vnn_unassign_iface(ctdb, vnn);
+               count++;
         }
+
+       DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
  }
  
  
@@ -3081,7 +3435,7 @@ int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
                 if (vnn->iface == cur) {
                         info->active_idx = i;
                 }
-               strcpy(info->ifaces[i].name, cur->name);
+               strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
                 info->ifaces[i].link_state = cur->link_up;
                 info->ifaces[i].references = cur->references;
         }
@@ -3548,11 +3902,11 @@ int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA ind
                                  * list->tickles.num) {
                 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
                 return -1;
-       }       
+       }
  
         vnn = find_public_ip_vnn(ctdb, &list->addr);
         if (vnn == NULL) {
-               DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
+               DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
                         ctdb_addr_to_str(&list->addr)));
  
                 return 1;
@@ -3562,7 +3916,7 @@ int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA ind
         talloc_free(vnn->tcp_array);
         vnn->tcp_array = NULL;
  
-       tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
+       tcparray = talloc(vnn, struct ctdb_tcp_array);
         CTDB_NO_MEMORY(ctdb, tcparray);
  
         tcparray->num = list->tickles.num;
@@ -3570,12 +3924,12 @@ int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA ind
         tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
         CTDB_NO_MEMORY(ctdb, tcparray->connections);
  
-       memcpy(tcparray->connections, &list->tickles.connections[0], 
+       memcpy(tcparray->connections, &list->tickles.connections[0],
                sizeof(struct ctdb_tcp_connection)*tcparray->num);
  
         /* We now have a new fresh tickle list array for this vnn */
-       vnn->tcp_array = talloc_steal(vnn, tcparray);
-       
+       vnn->tcp_array = tcparray;
+
         return 0;
  }
  
@@ -3628,10 +3982,9 @@ int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA ind
  /*
    set the list of all tcp tickles for a public address
   */
-static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
-                             struct timeval timeout, uint32_t destnode, 
-                             ctdb_sock_addr *addr,
-                             struct ctdb_tcp_array *tcparray)
+static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
+                                           ctdb_sock_addr *addr,
+                                           struct ctdb_tcp_array *tcparray)
  {
         int ret, num;
         TDB_DATA data;
@@ -3656,7 +4009,7 @@ static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb,
                 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
         }
  
-       ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
+       ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
                                        CTDB_CONTROL_SET_TCP_TICKLE_LIST,
                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
         if (ret != 0) {
@@ -3692,14 +4045,14 @@ static void ctdb_update_tcp_tickles(struct event_context *ev,
                 if (!vnn->tcp_update_needed) {
                         continue;
                 }
-               ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
-                               TAKEOVER_TIMEOUT(),
-                               CTDB_BROADCAST_CONNECTED,
-                               &vnn->public_address,
-                               vnn->tcp_array);
+               ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
+                                                      &vnn->public_address,
+                                                      vnn->tcp_array);
                 if (ret != 0) {
                         DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
                                 ctdb_addr_to_str(&vnn->public_address)));
+               } else {
+                       vnn->tcp_update_needed = false;
                 }
         }
  
@@ -3823,6 +4176,8 @@ int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA inda
                 return -1;
         }
  
+       DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
         ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
  
         if (ret != 0) {
@@ -3833,20 +4188,32 @@ int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA inda
         return 0;
  }
  
+struct delete_ip_callback_state {
+       struct ctdb_req_control *c; /* request that delete_ip_callback replies to */
+};
+
  /*
    called when releaseip event finishes for del_public_address
   */
-static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
-                               void *private_data)
+static void delete_ip_callback(struct ctdb_context *ctdb,
+                              int32_t status, TDB_DATA data,
+                              const char *errormsg,
+                              void *private_data)
  {
+       struct delete_ip_callback_state *state =
+               talloc_get_type(private_data, struct delete_ip_callback_state);
+
+       /* Answer the delete request with the release status. */
+       ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
         talloc_free(private_data);
  }
  
-int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
+int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
+                                       struct ctdb_req_control *c,
+                                       TDB_DATA indata, bool *async_reply)
  {
         struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
         struct ctdb_vnn *vnn;
-       int ret;
  
         /* verify the size of indata */
         if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
@@ -3864,41 +4231,68 @@ int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA inda
                 return -1;
         }
  
+       DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
         /* walk over all public addresses until we find a match */
         for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
                 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
-                       TALLOC_CTX *mem_ctx = talloc_new(ctdb);
-
-                       DLIST_REMOVE(ctdb->vnn, vnn);
-                       talloc_steal(mem_ctx, vnn);
-                       ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
-                       if (vnn->pnn != ctdb->pnn) {
-                               if (vnn->iface != NULL) {
-                                       ctdb_vnn_unassign_iface(ctdb, vnn);
+                       if (vnn->pnn == ctdb->pnn) {
+                               struct delete_ip_callback_state *state;
+                               struct ctdb_public_ip *ip;
+                               TDB_DATA data;
+                               int ret;
+
+                               vnn->delete_pending = true;
+
+                               state = talloc(ctdb,
+                                              struct delete_ip_callback_state);
+                               CTDB_NO_MEMORY(ctdb, state);
+                               state->c = c;
+
+                               ip = talloc(state, struct ctdb_public_ip);
+                               if (ip == NULL) {
+                                       DEBUG(DEBUG_ERR,
+                                             (__location__ " Out of memory\n"));
+                                       talloc_free(state);
+                                       return -1;
+                               }
+                               ip->pnn = -1;
+                               ip->addr = pub->addr;
+
+                               data.dsize = sizeof(struct ctdb_public_ip);
+                               data.dptr = (unsigned char *)ip;
+
+                               ret = ctdb_daemon_send_control(ctdb,
+                                                              ctdb_get_pnn(ctdb),
+                                                              0,
+                                                              CTDB_CONTROL_RELEASE_IP,
+                                                              0, 0,
+                                                              data,
+                                                              delete_ip_callback,
+                                                              state);
+                               if (ret == -1) {
+                                       DEBUG(DEBUG_ERR,
+                                             (__location__ "Unable to send "
+                                              "CTDB_CONTROL_RELEASE_IP\n"));
+                                       talloc_free(state);
+                                       return -1;
                                 }
-                               talloc_free(mem_ctx);
-                               return 0;
-                       }
-                       vnn->pnn = -1;
  
-                       ret = ctdb_event_script_callback(ctdb, 
-                                        mem_ctx, delete_ip_callback, mem_ctx,
-                                        false,
-                                        CTDB_EVENT_RELEASE_IP,
-                                        "%s %s %u",
-                                        ctdb_vnn_iface_string(vnn),
-                                        ctdb_addr_to_str(&vnn->public_address),
-                                        vnn->public_netmask_bits);
-                       if (vnn->iface != NULL) {
-                               ctdb_vnn_unassign_iface(ctdb, vnn);
-                       }
-                       if (ret != 0) {
-                               return -1;
+                               state->c = talloc_steal(state, c);
+                               *async_reply = true;
+                       } else {
+                               /* This IP is not hosted on the
+                                * current node so just delete it
+                                * now. */
+                               do_delete_ip(ctdb, vnn);
                         }
+
                         return 0;
                 }
         }
  
+       DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
+                        ctdb_addr_to_str(&pub->addr)));
         return -1;
  }
  
@@ -3941,7 +4335,7 @@ int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
  
         ret = ctdb_event_script_callback(ctdb, state,
                                          ctdb_ipreallocated_callback, state,
-                                        false, CTDB_EVENT_IPREALLOCATED,
+                                        CTDB_EVENT_IPREALLOCATED,
                                          "%s", "");
  
         if (ret != 0) {
@@ -3962,7 +4356,9 @@ int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
     node has the expected ip allocation.
     This is verified against ctdb->ip_tree
  */
-int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
+int verify_remote_ip_allocation(struct ctdb_context *ctdb,
+                               struct ctdb_all_public_ips *ips,
+                               uint32_t pnn)
  {
         struct ctdb_public_ip_list *tmp_ip; 
         int i;
@@ -3980,7 +4376,7 @@ int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_publi
         for (i=0; i<ips->num; i++) {
                 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
                 if (tmp_ip == NULL) {
-                       DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
+                       DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
                         return -1;
                 }
  
@@ -3989,7 +4385,11 @@ int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_publi
                 }
  
                 if (tmp_ip->pnn != ips->ips[i].pnn) {
-                       DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
+                       DEBUG(DEBUG_ERR,
+                             ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
+                              pnn,
+                              ctdb_addr_to_str(&ips->ips[i].addr),
+                              ips->ips[i].pnn, tmp_ip->pnn));
                         return -1;
                 }
         }
@@ -4073,89 +4473,173 @@ static int ctdb_reloadips_child(struct ctdb_context *ctdb)
         TALLOC_CTX *mem_ctx = talloc_new(NULL);
         struct ctdb_all_public_ips *ips;
         struct ctdb_vnn *vnn;
+       struct client_async_data *async_data;
+       struct timeval timeout;
+       TDB_DATA data;
+       struct ctdb_client_control_state *state;
+       bool first_add;
         int i, ret;
  
-       /* read the ip allocation from the local node */
-       ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
+       CTDB_NO_MEMORY(ctdb, mem_ctx);
+
+       /* Read IPs from local node */
+       ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
+                                      CTDB_CURRENT_NODE, mem_ctx, &ips);
         if (ret != 0) {
-               DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
+               DEBUG(DEBUG_ERR,
+                     ("Unable to fetch public IPs from local node\n"));
                 talloc_free(mem_ctx);
                 return -1;
         }
  
-       /* re-read the public ips file */
+       /* Read IPs file - this is safe since this is a child process */
         ctdb->vnn = NULL;
         if (ctdb_set_public_addresses(ctdb, false) != 0) {
                 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
                 talloc_free(mem_ctx);
                 return -1;
-       }               
+       }
  
+       async_data = talloc_zero(mem_ctx, struct client_async_data);
+       CTDB_NO_MEMORY(ctdb, async_data);
  
-       /* check the previous list of ips and scan for ips that have been
-          dropped.
-        */
+       /* Compare IPs between node and file for IPs to be deleted */
         for (i = 0; i < ips->num; i++) {
+               /* Is this node IP still listed in the re-read file? */
                 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
-                       if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
+                       if (ctdb_same_ip(&vnn->public_address,
+                                        &ips->ips[i].addr)) {
+                               /* IP is still in file */
                                 break;
                         }
                 }
  
-               /* we need to delete this ip, no longer available on this node */
                 if (vnn == NULL) {
-                       struct ctdb_control_ip_iface pub;
+                       /* Delete IP ips->ips[i] */
+                       struct ctdb_control_ip_iface *pub;
  
-                       DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
-                       pub.addr  = ips->ips[i].addr;
-                       pub.mask  = 0;
-                       pub.len   = 0;
+                       DEBUG(DEBUG_NOTICE,
+                             ("IP %s no longer configured, deleting it\n",
+                              ctdb_addr_to_str(&ips->ips[i].addr)));
  
-                       ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
-                       if (ret != 0) {
-                               DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
-                               return -1;
+                       pub = talloc_zero(mem_ctx,
+                                         struct ctdb_control_ip_iface);
+                       CTDB_NO_MEMORY(ctdb, pub);
+
+                       pub->addr  = ips->ips[i].addr;
+                       pub->mask  = 0;
+                       pub->len   = 0;
+
+                       timeout = TAKEOVER_TIMEOUT();
+
+                       data.dsize = offsetof(struct ctdb_control_ip_iface,
+                                             iface) + pub->len;
+                       data.dptr = (uint8_t *)pub;
+
+                       state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+                                                 CTDB_CONTROL_DEL_PUBLIC_IP,
+                                                 0, data, async_data,
+                                                 &timeout, NULL);
+                       if (state == NULL) {
+                               DEBUG(DEBUG_ERR,
+                                     (__location__
+                                      " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
+                               goto failed;
                         }
+
+                       ctdb_client_async_add(async_data, state);
                 }
         }
  
-
-       /* loop over all new ones and check the ones we need to add */
+       /* Compare IPs between node and file for IPs to be added */
+       first_add = true;
         for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
                 for (i = 0; i < ips->num; i++) {
-                       if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
+                       if (ctdb_same_ip(&vnn->public_address,
+                                        &ips->ips[i].addr)) {
+                               /* IP already on node */
                                 break;
                         }
                 }
                 if (i == ips->num) {
-                       struct ctdb_control_ip_iface pub;
+                       /* Add IP vnn->public_address */
+                       struct ctdb_control_ip_iface *pub;
                         const char *ifaces = NULL;
+                       uint32_t len;
                         int iface = 0;
  
-                       DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
-
-                       pub.addr  = vnn->public_address;
-                       pub.mask  = vnn->public_netmask_bits;
+                       DEBUG(DEBUG_NOTICE,
+                             ("New IP %s configured, adding it\n",
+                              ctdb_addr_to_str(&vnn->public_address)));
+                       if (first_add) {
+                               uint32_t pnn = ctdb_get_pnn(ctdb);
+
+                               data.dsize = sizeof(pnn);
+                               data.dptr  = (uint8_t *)&pnn;
+
+                               ret = ctdb_client_send_message(
+                                       ctdb,
+                                       CTDB_BROADCAST_CONNECTED,
+                                       CTDB_SRVID_REBALANCE_NODE,
+                                       data);
+                               if (ret != 0) {
+                                       DEBUG(DEBUG_WARNING,
+                                             ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
+                               }
  
+                               first_add = false;
+                       }
  
                         ifaces = vnn->ifaces[0];
                         iface = 1;
                         while (vnn->ifaces[iface] != NULL) {
-                               ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
+                               ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
+                                                        vnn->ifaces[iface]);
                                 iface++;
                         }
-                       pub.len   = strlen(ifaces)+1;
-                       memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
  
-                       ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
-                       if (ret != 0) {
-                               DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
-                               return -1;
+                       len   = strlen(ifaces) + 1;
+                       pub = talloc_zero_size(mem_ctx,
+                                              offsetof(struct ctdb_control_ip_iface, iface) + len);
+                       CTDB_NO_MEMORY(ctdb, pub);
+
+                       pub->addr  = vnn->public_address;
+                       pub->mask  = vnn->public_netmask_bits;
+                       pub->len   = len;
+                       memcpy(&pub->iface[0], ifaces, pub->len);
+
+                       timeout = TAKEOVER_TIMEOUT();
+
+                       data.dsize = offsetof(struct ctdb_control_ip_iface,
+                                             iface) + pub->len;
+                       data.dptr = (uint8_t *)pub;
+
+                       state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+                                                 CTDB_CONTROL_ADD_PUBLIC_IP,
+                                                 0, data, async_data,
+                                                 &timeout, NULL);
+                       if (state == NULL) {
+                               DEBUG(DEBUG_ERR,
+                                     (__location__
+                                      " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
+                               goto failed;
                         }
+
+                       ctdb_client_async_add(async_data, state);
                 }
         }
  
+       if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
+               goto failed;
+       }
+
+       talloc_free(mem_ctx);
         return 0;
+
+failed:
+       talloc_free(mem_ctx);
+       return -1;
  }
  
  /* This control is sent to force the node to re-read the public addresses file
@@ -4200,6 +4684,7 @@ int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_re
                 close(h->fd[0]);
                 debug_extra = talloc_asprintf(NULL, "reloadips:");
  
+               ctdb_set_process_name("ctdb_reloadips");
                 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
                         DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
                         res = -1;